From a440943e68cd1b5a853a6f60865967b7cc2539eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Sep 2021 08:59:58 +0200 Subject: unicode: remove the charset field from struct unicode_map It is hardcoded and only used for an f2fs sysfs file where it can be hardcoded just as easily. Signed-off-by: Christoph Hellwig Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Gabriel Krisman Bertazi --- include/linux/unicode.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/unicode.h b/include/linux/unicode.h index 74484d44c755..6a392cd9f076 100644 --- a/include/linux/unicode.h +++ b/include/linux/unicode.h @@ -6,7 +6,6 @@ #include struct unicode_map { - const char *charset; int version; }; -- cgit v1.2.3 From f3a9c82396006a5664f6e398d6928799d29de76e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Sep 2021 08:59:59 +0200 Subject: unicode: mark the version field in struct unicode_map unsigned unicode version triplets are always unsigned. Signed-off-by: Christoph Hellwig Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Gabriel Krisman Bertazi --- include/linux/unicode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/unicode.h b/include/linux/unicode.h index 6a392cd9f076..0744f81c4b5f 100644 --- a/include/linux/unicode.h +++ b/include/linux/unicode.h @@ -6,7 +6,7 @@ #include struct unicode_map { - int version; + unsigned int version; }; int utf8_validate(const struct unicode_map *um, const struct qstr *str); -- cgit v1.2.3 From 49bd03cc7e95cb78420305ca2f5ef67497b6fa80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Sep 2021 09:00:00 +0200 Subject: unicode: pass a UNICODE_AGE() tripple to utf8_load Don't bother with pointless string parsing when the caller can just pass the version in the format that the core expects. Also remove the fallback to the latest version that none of the callers actually uses. 
Signed-off-by: Christoph Hellwig Signed-off-by: Gabriel Krisman Bertazi --- include/linux/unicode.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/unicode.h b/include/linux/unicode.h index 0744f81c4b5f..77bb915fd1f0 100644 --- a/include/linux/unicode.h +++ b/include/linux/unicode.h @@ -5,6 +5,29 @@ #include #include +#define UNICODE_MAJ_SHIFT 16 +#define UNICODE_MIN_SHIFT 8 + +#define UNICODE_AGE(MAJ, MIN, REV) \ + (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ + ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ + ((unsigned int)(REV))) + +static inline u8 unicode_major(unsigned int age) +{ + return (age >> UNICODE_MAJ_SHIFT) & 0xff; +} + +static inline u8 unicode_minor(unsigned int age) +{ + return (age >> UNICODE_MIN_SHIFT) & 0xff; +} + +static inline u8 unicode_rev(unsigned int age) +{ + return age & 0xff; +} + struct unicode_map { unsigned int version; }; @@ -29,7 +52,7 @@ int utf8_casefold(const struct unicode_map *um, const struct qstr *str, int utf8_casefold_hash(const struct unicode_map *um, const void *salt, struct qstr *str); -struct unicode_map *utf8_load(const char *version); +struct unicode_map *utf8_load(unsigned int version); void utf8_unload(struct unicode_map *um); #endif /* _LINUX_UNICODE_H */ -- cgit v1.2.3 From 6ca99ce756c27852d1ea1e555045de1c920f30ed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Sep 2021 09:00:04 +0200 Subject: unicode: cache the normalization tables in struct unicode_map Instead of repeatedly looking up the version add pointers to the NFD and NFD+CF tables to struct unicode_map, and pass a unicode_map plus index to the functions using the normalization tables. 
Signed-off-by: Christoph Hellwig Signed-off-by: Gabriel Krisman Bertazi --- include/linux/unicode.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/unicode.h b/include/linux/unicode.h index 77bb915fd1f0..526ca8b8391a 100644 --- a/include/linux/unicode.h +++ b/include/linux/unicode.h @@ -5,6 +5,8 @@ #include #include +struct utf8data; + #define UNICODE_MAJ_SHIFT 16 #define UNICODE_MIN_SHIFT 8 @@ -28,8 +30,25 @@ static inline u8 unicode_rev(unsigned int age) return age & 0xff; } +/* + * Two normalization forms are supported: + * 1) NFDI + * - Apply unicode normalization form NFD. + * - Remove any Default_Ignorable_Code_Point. + * 2) NFDICF + * - Apply unicode normalization form NFD. + * - Remove any Default_Ignorable_Code_Point. + * - Apply a full casefold (C + F). + */ +enum utf8_normalization { + UTF8_NFDI = 0, + UTF8_NFDICF, + UTF8_NMAX, +}; + struct unicode_map { unsigned int version; + const struct utf8data *ntab[UTF8_NMAX]; }; int utf8_validate(const struct unicode_map *um, const struct qstr *str); -- cgit v1.2.3 From 2b3d047870120bcd46d7cc257d19ff49328fd585 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Sep 2021 09:00:05 +0200 Subject: unicode: Add utf8-data module utf8data.h contains a large database table which is an auto-generated decodification trie for the unicode normalization functions. Allow building it into a separate module. Based on a patch from Shreeya Patel . 
Signed-off-by: Christoph Hellwig Signed-off-by: Gabriel Krisman Bertazi --- include/linux/unicode.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/unicode.h b/include/linux/unicode.h index 526ca8b8391a..4d39e6e11a95 100644 --- a/include/linux/unicode.h +++ b/include/linux/unicode.h @@ -6,6 +6,7 @@ #include struct utf8data; +struct utf8data_table; #define UNICODE_MAJ_SHIFT 16 #define UNICODE_MIN_SHIFT 8 @@ -49,6 +50,7 @@ enum utf8_normalization { struct unicode_map { unsigned int version; const struct utf8data *ntab[UTF8_NMAX]; + const struct utf8data_table *tables; }; int utf8_validate(const struct unicode_map *um, const struct qstr *str); -- cgit v1.2.3 From dd66f56caea6bb1a3703fb3bfc3106444d05a930 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 21 Oct 2021 08:55:24 +0200 Subject: dma-buf: fix kerneldoc for renamed members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those members were renamed; update the kerneldoc as well. 
Signed-off-by: Christian König Reviewed-by: Alex Deucher Acked-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20211021141945.84023-1-christian.koenig@amd.com --- include/linux/dma-buf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 02c2eb874da6..9807aef33685 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -433,8 +433,8 @@ struct dma_buf { /** @poll: for userspace poll support */ wait_queue_head_t poll; - /** @cb_excl: for userspace poll support */ - /** @cb_shared: for userspace poll support */ + /** @cb_in: for userspace poll support */ + /** @cb_out: for userspace poll support */ struct dma_buf_poll_cb_t { struct dma_fence_cb cb; wait_queue_head_t *poll; -- cgit v1.2.3 From 1705643faecde95bdeb11bea5ab5baed084e9f91 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Tue, 19 Oct 2021 05:30:20 +0800 Subject: mmc: add MT7921 SDIO identifiers for MediaTek Bluetooth devices The MT7961 SDIO identifier for MediaTek Bluetooth devices was being referred to in the MediaTek Bluetooth driver. 
Co-developed-by: Mark-yw Chen Signed-off-by: Mark-yw Chen Signed-off-by: Sean Wang Signed-off-by: Marcel Holtmann --- include/linux/mmc/sdio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index a85c9f0bd470..53f0efa0bccf 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -105,6 +105,7 @@ #define SDIO_VENDOR_ID_MEDIATEK 0x037a #define SDIO_DEVICE_ID_MEDIATEK_MT7663 0x7663 #define SDIO_DEVICE_ID_MEDIATEK_MT7668 0x7668 +#define SDIO_DEVICE_ID_MEDIATEK_MT7961 0x7961 #define SDIO_VENDOR_ID_MICROCHIP_WILC 0x0296 #define SDIO_DEVICE_ID_MICROCHIP_WILC1000 0x5347 -- cgit v1.2.3 From 3ab7b6ac5d829e60c3b89d415811ff1c9f358c8e Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 25 Oct 2021 10:09:23 -0700 Subject: pwm: Introduce single-PWM of_xlate function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing pxa driver and the upcoming addition of PWM support in the TI sn565dsi86 DSI/eDP bridge driver both have a single PWM channel and thereby a need for an of_xlate function with the period as its single argument. Introduce a common helper function in the core that can be used as of_xlate by such drivers and migrate the pxa driver to use this. 
Signed-off-by: Bjorn Andersson Acked-by: Uwe Kleine-König Tested-by: Steev Klimaszewski Tested-By: Steev Klimaszewski Signed-off-by: Robert Foss Link: https://patchwork.freedesktop.org/patch/msgid/20211025170925.3096444-1-bjorn.andersson@linaro.org --- include/linux/pwm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 725c9b784e60..dd51d4931fdc 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -414,6 +414,8 @@ struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip, struct pwm_device *of_pwm_xlate_with_flags(struct pwm_chip *pc, const struct of_phandle_args *args); +struct pwm_device *of_pwm_single_xlate(struct pwm_chip *pc, + const struct of_phandle_args *args); struct pwm_device *pwm_get(struct device *dev, const char *con_id); struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np, -- cgit v1.2.3 From 7c7e3d31e7856a8260a254f8c71db416f7f9f5a1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 5 Nov 2021 16:23:29 -0700 Subject: bpf: Introduce helper bpf_find_vma In some profiler use cases, it is necessary to map an address to the backing file, e.g., a shared library. bpf_find_vma helper provides a flexible way to achieve this. bpf_find_vma maps an address of a task to the vma (vm_area_struct) for this address, and feeds the vma to a callback BPF function. The callback function is necessary here, as we need to ensure mmap_sem is unlocked. It is necessary to lock mmap_sem for find_vma. To lock and unlock mmap_sem safely when irqs are disabled, we use the same mechanism as stackmap with build_id. Specifically, when irqs are disabled, the unlock is postponed in an irq_work. Refactor stackmap.c so that the irq_work is shared among bpf_find_vma and stackmap helpers. 
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Tested-by: Hengqi Chen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211105232330.1936330-2-songliubraving@fb.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2be6dfd68df9..df3410bff4b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2157,6 +2157,7 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; +extern const struct bpf_func_proto bpf_find_vma_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From fa443bc3c1e4b28d9315dea882e8358ba6e26f8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 29 Oct 2021 17:28:56 +0200 Subject: HID: intel-ish-hid: add support for MODULE_DEVICE_TABLE() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows to selectively autoload drivers for ISH devices. Currently all ISH drivers are loaded for all systems having any ISH device. 
Signed-off-by: Thomas Weißschuh Acked-by: Srinivas Pandruvada Acked-by: Hans de Goede Signed-off-by: Jiri Kosina --- include/linux/mod_devicetable.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index ae2e75d15b21..befbf53c4b7c 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -895,4 +895,17 @@ struct dfl_device_id { kernel_ulong_t driver_data; }; +/* ISHTP (Integrated Sensor Hub Transport Protocol) */ + +#define ISHTP_MODULE_PREFIX "ishtp:" + +/** + * struct ishtp_device_id - ISHTP device identifier + * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba + * @context: pointer to driver specific data + */ +struct ishtp_device_id { + guid_t guid; +}; + #endif /* LINUX_MOD_DEVICETABLE_H */ -- cgit v1.2.3 From 64355db3caf6468dc711995239efe0cbcd7d0091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 10 Nov 2021 13:16:55 +0100 Subject: mod_devicetable: fix kdocs for ishtp_device_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kdocs were copied from another device_id struct and not adapted. Fixes: fa443bc3c1e4 ("HID: intel-ish-hid: add support for MODULE_DEVICE_TABLE()") Signed-off-by: Thomas Weißschuh Reported-by: Stephen Rothwell Signed-off-by: Jiri Kosina --- include/linux/mod_devicetable.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index befbf53c4b7c..c70abe7aaef2 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -901,8 +901,7 @@ struct dfl_device_id { /** * struct ishtp_device_id - ISHTP device identifier - * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba - * @context: pointer to driver specific data + * @guid: GUID of the device. 
*/ struct ishtp_device_id { guid_t guid; -- cgit v1.2.3 From 5d5e4522a7f404d1a96fd6c703989d32a9c9568d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 7 Nov 2021 14:51:16 +1000 Subject: printk: restore flushing of NMI buffers on remote CPUs after NMI backtraces printk from NMI context relies on irq work being raised on the local CPU to print to console. This can be a problem if the NMI was raised by a lockup detector to print lockup stack and regs, because the CPU may not enable irqs (because it is locked up). Introduce printk_trigger_flush() that can be called another CPU to try to get those messages to the console, call that where printk_safe_flush was previously called. Fixes: 93d102f094be ("printk: remove safe buffers") Cc: stable@vger.kernel.org # 5.15 Signed-off-by: Nicholas Piggin Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211107045116.1754411-1-npiggin@gmail.com --- include/linux/printk.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index a1379df43251..596ad6fa0336 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -206,6 +206,7 @@ void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; +void printk_trigger_flush(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -282,6 +283,9 @@ static inline void dump_stack_lvl(const char *log_lvl) static inline void dump_stack(void) { } +static inline void printk_trigger_flush(void) +{ +} #endif #ifdef CONFIG_SMP -- cgit v1.2.3 From f89315650ba34ec6c91a8bded72796980bee2a4d Mon Sep 17 00:00:00 2001 From: Mark Pashmfouroush Date: Wed, 10 Nov 2021 11:10:15 +0000 Subject: bpf: Add ingress_ifindex to bpf_sk_lookup It may be helpful to have access to the ifindex 
during bpf socket lookup. An example may be to scope certain socket lookup logic to specific interfaces, i.e. an interface may be made exempt from custom lookup code. Add the ifindex of the arriving connection to the bpf_sk_lookup API. Signed-off-by: Mark Pashmfouroush Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211110111016.5670-2-markpash@cloudflare.com --- include/linux/filter.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 24b7ed2677af..b6a216eb217a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1374,6 +1374,7 @@ struct bpf_sk_lookup_kern { const struct in6_addr *daddr; } v6; struct sock *selected_sk; + u32 ingress_ifindex; bool no_reuseport; }; @@ -1436,7 +1437,7 @@ extern struct static_key_false bpf_sk_lookup_enabled; static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 dport, - struct sock **psk) + const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; @@ -1452,6 +1453,7 @@ static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, .v4.daddr = daddr, .sport = sport, .dport = dport, + .ingress_ifindex = ifindex, }; u32 act; @@ -1474,7 +1476,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, const __be16 sport, const struct in6_addr *daddr, const u16 dport, - struct sock **psk) + const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; @@ -1490,6 +1492,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, .v6.daddr = daddr, .sport = sport, .dport = dport, + .ingress_ifindex = ifindex, }; u32 act; -- cgit v1.2.3 From a25efb3863d068929f0bbeb87a995df11507e691 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 23 Sep 2021 13:57:42 +0200 
Subject: dma-buf: add dma_fence_describe and dma_resv_describe v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add functions to dump dma_fence and dma_resv objects into a seq_file and use them for printing the debugfs information. v2: fix missing include reported by test robot. Signed-off-by: Christian König Reviewed-by: Rob Clark Link: https://patchwork.freedesktop.org/patch/msgid/20211103081231.18578-2-christian.koenig@amd.com --- include/linux/dma-fence.h | 1 + include/linux/dma-resv.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index a706b7bf51d7..1ea691753bd3 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -264,6 +264,7 @@ void dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, void dma_fence_release(struct kref *kref); void dma_fence_free(struct dma_fence *fence); +void dma_fence_describe(struct dma_fence *fence, struct seq_file *seq); /** * dma_fence_put - decreases refcount of the fence diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h index dbd235ab447f..09c6063b199a 100644 --- a/include/linux/dma-resv.h +++ b/include/linux/dma-resv.h @@ -490,5 +490,6 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src); long dma_resv_wait_timeout(struct dma_resv *obj, bool wait_all, bool intr, unsigned long timeout); bool dma_resv_test_signaled(struct dma_resv *obj, bool test_all); +void dma_resv_describe(struct dma_resv *obj, struct seq_file *seq); #endif /* _LINUX_RESERVATION_H */ -- cgit v1.2.3 From bf9167a8b40c9cf463521da05342db81808c1b6e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 Nov 2021 09:56:33 +0100 Subject: HID: intel-ish-hid: fix module device-id handling A late addition to the intel-ish-hid framework caused a build failure with clang, and introduced an ABI to the module loader that stops working if any driver ever needs to bind 
to more than one UUID: drivers/hid/intel-ish-hid/ishtp-fw-loader.c:1067:4: error: initializer element is not a compile-time constant Change the ishtp_device_id to have correct documentation and a driver_data field like all the other ones, and change the drivers to use the ID table as the primary identification in a way that works with all compilers and avoids duplicating the identifiers. Fixes: f155dfeaa4ee ("platform/x86: isthp_eclite: only load for matching devices") Fixes: facfe0a4fdce ("platform/chrome: chros_ec_ishtp: only load for matching devices") Fixes: 0d0cccc0fd83 ("HID: intel-ish-hid: hid-client: only load for matching devices") Fixes: 44e2a58cb880 ("HID: intel-ish-hid: fw-loader: only load for matching devices") Fixes: cb1a2c6847f7 ("HID: intel-ish-hid: use constants for modaliases") Fixes: fa443bc3c1e4 ("HID: intel-ish-hid: add support for MODULE_DEVICE_TABLE()") Signed-off-by: Arnd Bergmann Reviewed-by: Hans de Goede [jkosina@suse.cz: fix ecl_ishtp_cl_driver.id initialization] [jkosina@suse.cz: fix conflict with already fixed kerneldoc] Signed-off-by: Jiri Kosina --- include/linux/intel-ish-client-if.h | 4 ++-- include/linux/mod_devicetable.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index aee8ff4739b1..f45f13304add 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -9,7 +9,7 @@ #define _INTEL_ISH_CLIENT_IF_H_ #include -#include +#include struct ishtp_cl_device; struct ishtp_device; @@ -40,7 +40,7 @@ enum cl_state { struct ishtp_cl_driver { struct device_driver driver; const char *name; - const guid_t *guid; + const struct ishtp_device_id *id; int (*probe)(struct ishtp_cl_device *dev); void (*remove)(struct ishtp_cl_device *dev); int (*reset)(struct ishtp_cl_device *dev); diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 
c70abe7aaef2..4bb71979a8fd 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -902,9 +902,11 @@ struct dfl_device_id { /** * struct ishtp_device_id - ISHTP device identifier * @guid: GUID of the device. + * @driver_data: pointer to driver specific data */ struct ishtp_device_id { guid_t guid; + kernel_ulong_t driver_data; }; #endif /* LINUX_MOD_DEVICETABLE_H */ -- cgit v1.2.3 From 9e2ad638ae3632ef916ceb39f70e3104bf8fdc97 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 12 Nov 2021 07:02:42 -0800 Subject: bpf: Extend BTF_ID_LIST_GLOBAL with parameter for number of IDs syzbot reported the following BUG w/o CONFIG_DEBUG_INFO_BTF BUG: KASAN: global-out-of-bounds in task_iter_init+0x212/0x2e7 kernel/bpf/task_iter.c:661 Read of size 4 at addr ffffffff90297404 by task swapper/0/1 CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.15.0-syzkaller #0 Hardware name: ... Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0xf/0x309 mm/kasan/report.c:256 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 task_iter_init+0x212/0x2e7 kernel/bpf/task_iter.c:661 do_one_initcall+0x103/0x650 init/main.c:1295 do_initcall_level init/main.c:1368 [inline] do_initcalls init/main.c:1384 [inline] do_basic_setup init/main.c:1403 [inline] kernel_init_freeable+0x6b1/0x73a init/main.c:1606 kernel_init+0x1a/0x1d0 init/main.c:1497 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 This is caused by hard-coded name[1] in BTF_ID_LIST_GLOBAL (w/o CONFIG_DEBUG_INFO_BTF). Fix this by adding a parameter n to BTF_ID_LIST_GLOBAL. This avoids ifdef CONFIG_DEBUG_INFO_BTF in btf.c and filter.c. 
Fixes: 7c7e3d31e785 ("bpf: Introduce helper bpf_find_vma") Reported-by: syzbot+e0d81ec552a21d9071aa@syzkaller.appspotmail.com Reported-by: Eric Dumazet Suggested-by: Eric Dumazet Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211112150243.1270987-2-songliubraving@fb.com --- include/linux/btf_ids.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 47d9abfbdb55..6bb42b785293 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -73,7 +73,7 @@ asm( \ __BTF_ID_LIST(name, local) \ extern u32 name[]; -#define BTF_ID_LIST_GLOBAL(name) \ +#define BTF_ID_LIST_GLOBAL(name, n) \ __BTF_ID_LIST(name, globl) /* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with @@ -83,7 +83,7 @@ __BTF_ID_LIST(name, globl) BTF_ID_LIST(name) \ BTF_ID(prefix, typename) #define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) \ - BTF_ID_LIST_GLOBAL(name) \ + BTF_ID_LIST_GLOBAL(name, 1) \ BTF_ID(prefix, typename) /* @@ -149,7 +149,7 @@ extern struct btf_id_set name; #define BTF_ID_LIST(name) static u32 name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED -#define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_ID_LIST_GLOBAL(name, n) u32 name[n]; #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; #define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; -- cgit v1.2.3 From d19ddb476a539fd78ad1028ae13bb38506286931 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 12 Nov 2021 07:02:43 -0800 Subject: bpf: Introduce btf_tracing_ids Similar to btf_sock_ids, btf_tracing_ids provides btf ID for task_struct, file, and vm_area_struct via easy to understand format like btf_tracing_ids[BTF_TRACING_TYPE_[TASK|file|VMA]]. 
Suggested-by: Alexei Starovoitov Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211112150243.1270987-3-songliubraving@fb.com --- include/linux/btf_ids.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 6bb42b785293..919c0fde1c51 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -189,6 +189,18 @@ MAX_BTF_SOCK_TYPE, extern u32 btf_sock_ids[]; #endif -extern u32 btf_task_struct_ids[]; +#define BTF_TRACING_TYPE_xxx \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_TASK, task_struct) \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_FILE, file) \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_VMA, vm_area_struct) + +enum { +#define BTF_TRACING_TYPE(name, type) name, +BTF_TRACING_TYPE_xxx +#undef BTF_TRACING_TYPE +MAX_BTF_TRACING_TYPE, +}; + +extern u32 btf_tracing_ids[]; #endif -- cgit v1.2.3 From 1aa3b2207e889a948049c9a8016cedb0218c2389 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 12 Nov 2021 18:18:10 -0500 Subject: net,lsm,selinux: revert the security_sctp_assoc_established() hook This patch reverts two prior patches, e7310c94024c ("security: implement sctp_assoc_established hook in selinux") and 7c2ef0240e6a ("security: add sctp_assoc_established hook"), which create the security_sctp_assoc_established() LSM hook and provide a SELinux implementation. Unfortunately these two patches were merged without proper review (the Reviewed-by and Tested-by tags from Richard Haines were for previous revisions of these patches that were significantly different) and there are outstanding objections from the SELinux maintainers regarding these patches. Work is currently ongoing to correct the problems identified in the reverted patches, as well as others that have come up during review, but it is unclear at this point in time when that work will be ready for inclusion in the mainline kernel. 
In the interest of not keeping objectionable code in the kernel for multiple weeks, and potentially a kernel release, we are reverting the two problematic patches. Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- include/linux/lsm_hook_defs.h | 2 -- include/linux/lsm_hooks.h | 5 ----- include/linux/security.h | 7 ------- 3 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 442a611fa0fb..df8de62f4710 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -335,8 +335,6 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) -LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc, - struct sk_buff *skb) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d6823214d5c1..d45b6f6e27fd 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1050,11 +1050,6 @@ * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. * @newsk pointer to new sock structure. - * @sctp_assoc_established: - * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet - * to the security module. - * @asoc pointer to sctp association structure. - * @skb pointer to skbuff of association packet. 
* * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index 06eac4e61a13..bbf44a466832 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1430,8 +1430,6 @@ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); -void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb); #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct sock *sock, @@ -1651,11 +1649,6 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *newsk) { } - -static inline void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb) -{ -} #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND -- cgit v1.2.3 From 938aa33f14657c9ed9deea348b7d6f14b6d69cb7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 14 Nov 2021 13:28:34 -0500 Subject: tracing: Add length protection to histogram string copies The string copies to the histogram storage has a max size of 256 bytes (defined by MAX_FILTER_STR_VAL). Only the string size of the event field needs to be copied to the event storage, but no more than what is in the event storage. Although nothing should be bigger than 256 bytes, there's no protection against overwriting of the storage if one day there is. Copy no more than the destination size, and enforce it. Also had to turn MAX_FILTER_STR_VAL into an unsigned int, to keep the min() comparison of the string sizes of comparable types. 
Link: https://lore.kernel.org/all/CAHk-=wjREUihCGrtRBwfX47y_KrLCGjiq3t6QtoNJpmVrAEb1w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211114132834.183429a4@rorschach.local.home Cc: Ingo Molnar Cc: Andrew Morton Cc: Tom Zanussi Reported-by: Linus Torvalds Reviewed-by: Masami Hiramatsu Fixes: 63f84ae6b82b ("tracing/histogram: Do not copy the fixed-size char array field over the field size") Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 50453b287615..2d167ac3452c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -673,7 +673,7 @@ struct trace_event_file { #define PERF_MAX_TRACE_SIZE 8192 -#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ +#define MAX_FILTER_STR_VAL 256U /* Should handle KSYM_SYMBOL_LEN */ enum event_trigger_type { ETT_NONE = (0), -- cgit v1.2.3 From 4c7924fb905b02323ff6d9d20f370892615dccfa Mon Sep 17 00:00:00 2001 From: Julien Massot Date: Fri, 22 Oct 2021 14:21:01 +0200 Subject: soc: renesas: rcar-rst: Add support to set rproc boot address R-Car Gen3 SoC series has a realtime processor, the boot address of this processor can be set thanks to CR7BAR register of the reset module. Export this function so that it's possible to set the boot address from a remoteproc driver. Also drop the __initdata qualifier on rcar_rst_base, since we will use this address later than init time. 
Signed-off-by: Julien Massot Link: https://lore.kernel.org/r/20211022122101.66998-1-julien.massot@iot.bzh Signed-off-by: Geert Uytterhoeven --- include/linux/soc/renesas/rcar-rst.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/renesas/rcar-rst.h b/include/linux/soc/renesas/rcar-rst.h index 7899a5b8c247..1f1fe8bfaa76 100644 --- a/include/linux/soc/renesas/rcar-rst.h +++ b/include/linux/soc/renesas/rcar-rst.h @@ -4,8 +4,10 @@ #ifdef CONFIG_RST_RCAR int rcar_rst_read_mode_pins(u32 *mode); +int rcar_rst_set_rproc_boot_addr(u64 boot_addr); #else static inline int rcar_rst_read_mode_pins(u32 *mode) { return -ENODEV; } +static inline int rcar_rst_set_rproc_boot_addr(u64 boot_addr) { return -ENODEV; } #endif #endif /* __LINUX_SOC_RENESAS_RCAR_RST_H__ */ -- cgit v1.2.3 From 032816fbbfafe3198bb5c71fbbe4e8e5be33b352 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 27 Oct 2021 14:45:07 +0100 Subject: pinctrl: pinconf-generic: Add support for "output-impedance-ohms" to be extracted from DT files Add "output-impedance-ohms" property to generic options used for DT parsing files. This enables drivers, which use generic pin configurations, to get the value passed to this property. Signed-off-by: Lad Prabhakar Reviewed-by: Biju Das Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20211027134509.5036-3-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- include/linux/pinctrl/pinconf-generic.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index eee0e3948537..2422211d6a5a 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -91,6 +91,8 @@ struct pinctrl_map; * configuration (eg. the currently selected mux function) drive values on * the line. Use argument 1 to enable output mode, argument 0 to disable * it. 
+ * @PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS: this will configure the output impedance + * of the pin with the value passed as argument. The argument is in ohms. * @PIN_CONFIG_PERSIST_STATE: retain pin state across sleep or controller reset * @PIN_CONFIG_POWER_SOURCE: if the pin can select between different power * supplies, the argument to this parameter (on a custom format) tells @@ -129,6 +131,7 @@ enum pin_config_param { PIN_CONFIG_MODE_PWM, PIN_CONFIG_OUTPUT, PIN_CONFIG_OUTPUT_ENABLE, + PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, PIN_CONFIG_PERSIST_STATE, PIN_CONFIG_POWER_SOURCE, PIN_CONFIG_SKEW_DELAY, -- cgit v1.2.3 From 507805b83ff108473dba9d4909e41abd50cf07f5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 10 Nov 2021 15:47:42 +0200 Subject: gpiolib: acpi: Remove never used devm_acpi_dev_remove_driver_gpios() Remove never used devm_acpi_dev_remove_driver_gpios(). Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg --- include/linux/gpio/consumer.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 97a28ad3393b..3ad67b4a72be 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -690,7 +690,6 @@ void acpi_dev_remove_driver_gpios(struct acpi_device *adev); int devm_acpi_dev_add_driver_gpios(struct device *dev, const struct acpi_gpio_mapping *gpios); -void devm_acpi_dev_remove_driver_gpios(struct device *dev); struct gpio_desc *acpi_get_and_request_gpiod(char *path, int pin, char *label); @@ -708,7 +707,6 @@ static inline int devm_acpi_dev_add_driver_gpios(struct device *dev, { return -ENXIO; } -static inline void devm_acpi_dev_remove_driver_gpios(struct device *dev) {} #endif /* CONFIG_GPIOLIB && CONFIG_ACPI */ -- cgit v1.2.3 From 10a2308ffb8cf262e473eb324fde42ae31b6da04 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 12 Nov 2021 18:16:34 +0800 Subject: net: Clean up some inconsistent indenting Eliminate the following smatch 
warning: ./include/linux/skbuff.h:4229 skb_remcsum_process() warn: inconsistent indenting. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 686a666d073d..c8cb7e697d47 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4226,7 +4226,7 @@ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, return; } - if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { __skb_checksum_complete(skb); skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); } -- cgit v1.2.3 From 02d6fdecb9c38de19065f6bed8d5214556fd061d Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Thu, 4 Nov 2021 16:00:40 +0100 Subject: regmap: allow to define reg_update_bits for no bus configuration Some devices require special handling for reg_update_bits and can't use the normal regmap read write logic. An example is when locking is handled by the device and rmw operations require atomic operations. Allow to declare a dedicated function in regmap_config for reg_update_bits in no bus configuration. Signed-off-by: Ansuel Smith Link: https://lore.kernel.org/r/20211104150040.1260-1-ansuelsmth@gmail.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index e3c9a25a853a..22652e5fbc38 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -290,6 +290,11 @@ typedef void (*regmap_unlock)(void *); * read operation on a bus such as SPI, I2C, etc. Most of the * devices do not need this. * @reg_write: Same as above for writing. + * @reg_update_bits: Optional callback that if filled will be used to perform + * all the update_bits(rmw) operation. 
Should only be provided + * if the function require special handling with lock and reg + * handling and the operation cannot be represented as a simple + * update_bits operation on a bus such as SPI, I2C, etc. * @fast_io: Register IO is fast. Use a spinlock instead of a mutex * to perform locking. This field is ignored if custom lock/unlock * functions are used (see fields lock/unlock of struct regmap_config). @@ -372,6 +377,8 @@ struct regmap_config { int (*reg_read)(void *context, unsigned int reg, unsigned int *val); int (*reg_write)(void *context, unsigned int reg, unsigned int val); + int (*reg_update_bits)(void *context, unsigned int reg, + unsigned int mask, unsigned int val); bool fast_io; -- cgit v1.2.3 From 45971bdd8ca8b5a99a49f4db86737401c45e246f Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 2 Nov 2021 16:02:02 -0600 Subject: spi: remove unused header file Commit 6acaadc852f1 ("spi: clps711x: Driver refactor") removed the only use of <linux/platform_data/spi-clps711x.h>, but left the header file behind. This file is unused, delete it. 
Cc: Signed-off-by: Arnd Bergmann Cc: Signed-off-by: Mark Brown Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Jonathan Corbet Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20211102220203.940290-9-corbet@lwn.net Signed-off-by: Mark Brown --- include/linux/platform_data/spi-clps711x.h | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 include/linux/platform_data/spi-clps711x.h (limited to 'include/linux') diff --git a/include/linux/platform_data/spi-clps711x.h b/include/linux/platform_data/spi-clps711x.h deleted file mode 100644 index efaa596848c9..000000000000 --- a/include/linux/platform_data/spi-clps711x.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * CLPS711X SPI bus driver definitions - * - * Copyright (C) 2012 Alexander Shiyan - */ - -#ifndef ____LINUX_PLATFORM_DATA_SPI_CLPS711X_H -#define ____LINUX_PLATFORM_DATA_SPI_CLPS711X_H - -/* Board specific platform_data */ -struct spi_clps711x_pdata { - int *chipselect; /* Array of GPIO-numbers */ - int num_chipselect; /* Total count of GPIOs */ -}; - -#endif -- cgit v1.2.3 From a0ddee65c527d877e798205c1391c6170e580c66 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 12 Nov 2021 16:07:49 +0200 Subject: printk: Remove printk.h inclusion in percpu.h After the commit 42a0bb3f7138 ("printk/nmi: generic solution for safe printk in NMI") the printk.h is not needed anymore in percpu.h. Moreover `make headerdep` complains (an excerpt) In file included from linux/printk.h, from linux/dynamic_debug.h:188 from linux/printk.h:559 <-- here from linux/percpu.h:9 from linux/idr.h:17 include/net/9p/client.h:13: warning: recursive header inclusion Yeah, it's not a root cause of this, but removing will help to reduce the noise. 
Fixes: 42a0bb3f7138 ("printk/nmi: generic solution for safe printk in NMI") Signed-off-by: Andy Shevchenko Acked-by: Dennis Zhou Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211112140749.80042-1-andriy.shevchenko@linux.intel.com --- include/linux/percpu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 5e76af742c80..4fa3000f9c22 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 13cae4a104d2b7205696229ba85d34cc035f8c84 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Mon, 15 Nov 2021 10:49:21 +0800 Subject: i2c: core: Allow 255 byte transfers for SMBus 3.x SMBus 3.0 increased the maximum block transfer size from 32 bytes to 255 bytes. We increase the size of struct i2c_smbus_data's block[] member. i2c_smbus_xfer() and i2c_smbus_xfer_emulated() now support 255 byte block operations, other block functions remain limited to 32 bytes for compatibility with existing callers. We allow adapters to indicate support for the larger size with I2C_FUNC_SMBUS_V3_BLOCK. Most emulated drivers should be able to use 255 byte blocks by replacing I2C_SMBUS_BLOCK_MAX with I2C_SMBUS_V3_BLOCK_MAX though some will have hardware limitations that need testing. Signed-off-by: Matt Johnston Signed-off-by: David S. Miller --- include/linux/i2c.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 16119ac1aa97..353d6b4e7a53 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -52,6 +52,19 @@ typedef int (*i2c_slave_cb_t)(struct i2c_client *client, struct module; struct property_entry; +/* SMBus 3.0 extends the maximum block read/write size to 255 (from 32). 
+ * The larger size is only supported by some drivers, indicated by + * the I2C_FUNC_SMBUS_V3_BLOCK functionality bit. + */ +#define I2C_SMBUS_V3_BLOCK_MAX 255 /* As specified in SMBus 3.0 standard */ + +/* Note compatibility definition in uapi header with 32 byte block */ +union i2c_smbus_data { + __u8 byte; + __u16 word; + __u8 block[I2C_SMBUS_V3_BLOCK_MAX + 1]; /* block[0] is used for length */ +}; + #if IS_ENABLED(CONFIG_I2C) /* Return the Frequency mode string based on the bus frequency */ const char *i2c_freq_mode_string(u32 bus_freq_hz); -- cgit v1.2.3 From 34ae2c09d46a2d0abd907e139b466f798e4095a8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 15 Nov 2021 10:00:27 +0000 Subject: net: phylink: add generic validate implementation Add a generic validate() implementation using the supported_interfaces and a bitmask of MAC pause/speed/duplex capabilities. This allows us to entirely eliminate many driver private validate() implementations. We expose the underlying phylink_get_linkmodes() function so that drivers which have special needs can still benefit from conversion. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. 
Miller --- include/linux/phylink.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index f037470b6fb3..3563820a1765 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -20,6 +20,29 @@ enum { MLO_AN_PHY = 0, /* Conventional PHY */ MLO_AN_FIXED, /* Fixed-link mode */ MLO_AN_INBAND, /* In-band protocol */ + + MAC_SYM_PAUSE = BIT(0), + MAC_ASYM_PAUSE = BIT(1), + MAC_10HD = BIT(2), + MAC_10FD = BIT(3), + MAC_10 = MAC_10HD | MAC_10FD, + MAC_100HD = BIT(4), + MAC_100FD = BIT(5), + MAC_100 = MAC_100HD | MAC_100FD, + MAC_1000HD = BIT(6), + MAC_1000FD = BIT(7), + MAC_1000 = MAC_1000HD | MAC_1000FD, + MAC_2500FD = BIT(8), + MAC_5000FD = BIT(9), + MAC_10000FD = BIT(10), + MAC_20000FD = BIT(11), + MAC_25000FD = BIT(12), + MAC_40000FD = BIT(13), + MAC_50000FD = BIT(14), + MAC_56000FD = BIT(15), + MAC_100000FD = BIT(16), + MAC_200000FD = BIT(17), + MAC_400000FD = BIT(18), }; static inline bool phylink_autoneg_inband(unsigned int mode) @@ -69,6 +92,7 @@ enum phylink_op_type { * if MAC link is at %MLO_AN_FIXED mode. * @supported_interfaces: bitmap describing which PHY_INTERFACE_MODE_xxx * are supported by the MAC/PCS. + * @mac_capabilities: MAC pause/speed/duplex capabilities. 
*/ struct phylink_config { struct device *dev; @@ -79,6 +103,7 @@ struct phylink_config { void (*get_fixed_state)(struct phylink_config *config, struct phylink_link_state *state); DECLARE_PHY_INTERFACE_MASK(supported_interfaces); + unsigned long mac_capabilities; }; /** @@ -442,6 +467,12 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, phy_interface_t interface, int speed, int duplex); #endif +void phylink_get_linkmodes(unsigned long *linkmodes, phy_interface_t interface, + unsigned long mac_capabilities); +void phylink_generic_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state); + struct phylink *phylink_create(struct phylink_config *, struct fwnode_handle *, phy_interface_t iface, const struct phylink_mac_ops *mac_ops); -- cgit v1.2.3 From 2f6a470d6545841cf1891b87e360d3998ef024c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Nov 2021 07:49:46 -0800 Subject: Revert "Merge branch 'mctp-i2c-driver'" This reverts commit 71812af7234f30362b43ccff33f93890ae4c0655, reversing changes made to cc0be1ad686fb29a4d127948486f40b17fb34b50. Wolfram Sang says: Please revert. Besides the driver in net, it modifies the I2C core code. This has not been acked by the I2C maintainer (in this case me). So, please don't pull this in via the net tree. The question raised here (extending SMBus calls to 255 byte) is complicated because we need ABI backwards compatibility. Link: https://lore.kernel.org/all/YZJ9H4eM%2FM7OXVN0@shikoro/ Signed-off-by: Jakub Kicinski --- include/linux/i2c.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 353d6b4e7a53..16119ac1aa97 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -52,19 +52,6 @@ typedef int (*i2c_slave_cb_t)(struct i2c_client *client, struct module; struct property_entry; -/* SMBus 3.0 extends the maximum block read/write size to 255 (from 32). 
- * The larger size is only supported by some drivers, indicated by - * the I2C_FUNC_SMBUS_V3_BLOCK functionality bit. - */ -#define I2C_SMBUS_V3_BLOCK_MAX 255 /* As specified in SMBus 3.0 standard */ - -/* Note compatibility definition in uapi header with 32 byte block */ -union i2c_smbus_data { - __u8 byte; - __u16 word; - __u8 block[I2C_SMBUS_V3_BLOCK_MAX + 1]; /* block[0] is used for length */ -}; - #if IS_ENABLED(CONFIG_I2C) /* Return the Frequency mode string based on the bus frequency */ const char *i2c_freq_mode_string(u32 bus_freq_hz); -- cgit v1.2.3 From f64bd790b750dd281406964af40d16adfc88a074 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 29 Oct 2021 12:51:32 -0700 Subject: ACPI: Keep sub-table parsing infrastructure available for modules The NFIT driver and now the CXL ACPI driver have both open-coded ACPI table parsing. Before another instance is added arrange for the core ACPI sub-table parsing to be optionally available to drivers via the CONFIG_ACPI_TABLE_LIB symbol. If no drivers select the symbol then the infrastructure reverts back to being tagged __init via the __init_or_acpilib annotation. For now, only tag the core sub-table routines and data that the CEDT parsing in the cxl_acpi driver would want to reuse, a CEDT parsing helper is added in a later change. Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Alison Schofield Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/163553709227.2509508.8215196520233473814.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/acpi.h | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 143ce7e0bee1..edfa3c8f3562 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -232,14 +232,22 @@ int acpi_locate_initial_tables (void); void acpi_reserve_initial_tables (void); void acpi_table_init_complete (void); int acpi_table_init (void); + +#ifdef CONFIG_ACPI_TABLE_LIB +#define __init_or_acpilib +#define __initdata_or_acpilib +#else +#define __init_or_acpilib __init +#define __initdata_or_acpilib __initdata +#endif + int acpi_table_parse(char *id, acpi_tbl_table_handler handler); -int __init acpi_table_parse_entries(char *id, unsigned long table_size, - int entry_id, - acpi_tbl_entry_handler handler, - unsigned int max_entries); -int __init acpi_table_parse_entries_array(char *id, unsigned long table_size, - struct acpi_subtable_proc *proc, int proc_num, - unsigned int max_entries); +int __init_or_acpilib acpi_table_parse_entries(char *id, + unsigned long table_size, int entry_id, + acpi_tbl_entry_handler handler, unsigned int max_entries); +int __init_or_acpilib acpi_table_parse_entries_array(char *id, + unsigned long table_size, struct acpi_subtable_proc *proc, + int proc_num, unsigned int max_entries); int acpi_table_parse_madt(enum acpi_madt_type id, acpi_tbl_entry_handler handler, unsigned int max_entries); -- cgit v1.2.3 From ad2f63971e9655e3987db32dac85aa50658790eb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 29 Oct 2021 12:51:37 -0700 Subject: ACPI: Teach ACPI table parsing about the CEDT header format The CEDT adds yet one more unique subtable header type where the length is a 16-bit value. Extend the subtable helpers to detect this scenario. Cc: "Rafael J. 
Wysocki" Cc: Len Brown Tested-by: Alison Schofield Reviewed-by: Alison Schofield Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/163553709742.2509508.5177761945441327574.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index edfa3c8f3562..6b7f181d51e2 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -133,6 +133,7 @@ union acpi_subtable_headers { struct acpi_subtable_header common; struct acpi_hmat_structure hmat; struct acpi_prmt_module_header prmt; + struct acpi_cedt_header cedt; }; typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table); -- cgit v1.2.3 From 2d03e46a4bad20191d07b83ec1242d5f002577be Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 29 Oct 2021 12:51:42 -0700 Subject: ACPI: Add a context argument for table parsing handlers In preparation for drivers reusing the core table parsing infrastructure, arrange for handlers to take a context argument. This allows driver table parsing to wrap ACPI table entries in driver-specific data. The first consumer of this infrastructure is the CEDT parsing that happens in the cxl_acpi driver; add a conditional (CONFIG_ACPI_TABLE_LIB=y) export of an acpi_table_parse_cedt() helper for this case. Cc: "Rafael J. Wysocki" Cc: Len Brown Tested-by: Alison Schofield Reviewed-by: Alison Schofield Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/163553710257.2509508.14310494417463866020.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/acpi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6b7f181d51e2..95f88108f664 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -141,6 +141,9 @@ typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table); typedef int (*acpi_tbl_entry_handler)(union acpi_subtable_headers *header, const unsigned long end); +typedef int (*acpi_tbl_entry_handler_arg)(union acpi_subtable_headers *header, + void *arg, const unsigned long end); + /* Debugger support */ struct acpi_debugger_ops { @@ -217,6 +220,8 @@ static inline int acpi_debugger_notify_command_complete(void) struct acpi_subtable_proc { int id; acpi_tbl_entry_handler handler; + acpi_tbl_entry_handler_arg handler_arg; + void *arg; int count; }; @@ -235,9 +240,11 @@ void acpi_table_init_complete (void); int acpi_table_init (void); #ifdef CONFIG_ACPI_TABLE_LIB +#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI) #define __init_or_acpilib #define __initdata_or_acpilib #else +#define EXPORT_SYMBOL_ACPI_LIB(x) #define __init_or_acpilib __init #define __initdata_or_acpilib __initdata #endif @@ -252,6 +259,10 @@ int __init_or_acpilib acpi_table_parse_entries_array(char *id, int acpi_table_parse_madt(enum acpi_madt_type id, acpi_tbl_entry_handler handler, unsigned int max_entries); +int __init_or_acpilib +acpi_table_parse_cedt(enum acpi_cedt_type id, + acpi_tbl_entry_handler_arg handler_arg, void *arg); + int acpi_parse_mcfg (struct acpi_table_header *header); void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); -- cgit v1.2.3 From 03b122da74b22fbe7cd98184fa5657a9ce13970c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 26 Oct 2021 15:00:48 -0700 Subject: x86/sgx: Hook arch_memory_failure() into mainline code Add 
a call inside memory_failure() to call the arch specific code to check if the address is an SGX EPC page and handle it. Note the SGX EPC pages do not have a "struct page" entry, so the hook goes in at the same point as the device mapping hook. Pull the call to acquire the mutex earlier so the SGX errors are also protected. Make set_mce_nospec() skip SGX pages when trying to adjust the 1:1 map. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Reviewed-by: Naoya Horiguchi Tested-by: Reinette Chatre Link: https://lkml.kernel.org/r/20211026220050.697075-6-tony.luck@intel.com --- include/linux/mm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a7e4a9e7d807..57f1aa2a33b6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3231,6 +3231,19 @@ extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); +#ifndef arch_memory_failure +static inline int arch_memory_failure(unsigned long pfn, int flags) +{ + return -ENXIO; +} +#endif + +#ifndef arch_is_platform_page +static inline bool arch_is_platform_page(u64 paddr) +{ + return false; +} +#endif /* * Error handlers for various types of pages. -- cgit v1.2.3 From 749303055b78bc38ec0790ccc596cae235446367 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Mon, 15 Nov 2021 12:02:15 +0000 Subject: firmware: cs_dsp: tidy includes in cs_dsp.c and cs_dsp.h This patch removes unused included header files and moves others into cs_dsp.h to ensure that types referenced in the header file are properly described to prevent compiler warnings. 
Signed-off-by: Simon Trimmer Acked-by: Charles Keepax Link: https://lore.kernel.org/r/20211115120215.56824-1-simont@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 9ad9eaaaa552..3a54b1afc48f 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -11,6 +11,11 @@ #ifndef __CS_DSP_H #define __CS_DSP_H +#include +#include +#include +#include + #define CS_ADSP2_REGION_0 BIT(0) #define CS_ADSP2_REGION_1 BIT(1) #define CS_ADSP2_REGION_2 BIT(2) -- cgit v1.2.3 From e9380df851878cee71df5a1c7611584421527f7e Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Sun, 31 Oct 2021 20:48:52 -0500 Subject: ACPI: Add stubs for wakeup handler functions The commit ddfd9dcf270c ("ACPI: PM: Add acpi_[un]register_wakeup_handler()") added new functions for drivers to use during the s2idle wakeup path, but didn't add stubs for when CONFIG_ACPI wasn't set. Add those stubs in for other drivers to be able to use. Fixes: ddfd9dcf270c ("ACPI: PM: Add acpi_[un]register_wakeup_handler()") Acked-by: Rafael J. 
Wysocki Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20211101014853.6177-1-mario.limonciello@amd.com Signed-off-by: Linus Walleij --- include/linux/acpi.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 143ce7e0bee1..668d007f0917 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -974,6 +974,15 @@ static inline int acpi_get_local_address(acpi_handle handle, u32 *addr) return -ENODEV; } +static inline int acpi_register_wakeup_handler(int wake_irq, + bool (*wakeup)(void *context), void *context) +{ + return -ENXIO; +} + +static inline void acpi_unregister_wakeup_handler( + bool (*wakeup)(void *context), void *context) { } + #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC -- cgit v1.2.3 From a3143f7822a9eeb38f0e046080ae8f79f6c7122d Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 2 Nov 2021 16:01:58 -0600 Subject: Remove unused header Commit 6a80b30086b8 ("fmc: Delete the FMC subsystem") removed the last user of <linux/sdb.h>, but left the header file behind. Nothing uses this file, delete it now. Cc: Linus Walleij Cc: Alessandro Rubini Signed-off-by: Jonathan Corbet Acked-by: Alessandro Rubini Link: https://lore.kernel.org/r/20211102220203.940290-5-corbet@lwn.net Signed-off-by: Linus Walleij --- include/linux/sdb.h | 160 ---------------------------------------------------- 1 file changed, 160 deletions(-) delete mode 100644 include/linux/sdb.h (limited to 'include/linux') diff --git a/include/linux/sdb.h b/include/linux/sdb.h deleted file mode 100644 index a2404a2bbd10..000000000000 --- a/include/linux/sdb.h +++ /dev/null @@ -1,160 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This is the official version 1.1 of sdb.h - */ -#ifndef __SDB_H__ -#define __SDB_H__ -#ifdef __KERNEL__ -#include -#else -#include -#endif - -/* - * All structures are 64 bytes long and are expected - * to live in an array, one for each interconnect. 
- * Most fields of the structures are shared among the - * various types, and most-specific fields are at the - * beginning (for alignment reasons, and to keep the - * magic number at the head of the interconnect record - */ - -/* Product, 40 bytes at offset 24, 8-byte aligned - * - * device_id is vendor-assigned; version is device-specific, - * date is hex (e.g 0x20120501), name is UTF-8, blank-filled - * and not terminated with a 0 byte. - */ -struct sdb_product { - uint64_t vendor_id; /* 0x18..0x1f */ - uint32_t device_id; /* 0x20..0x23 */ - uint32_t version; /* 0x24..0x27 */ - uint32_t date; /* 0x28..0x2b */ - uint8_t name[19]; /* 0x2c..0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* - * Component, 56 bytes at offset 8, 8-byte aligned - * - * The address range is first to last, inclusive - * (for example 0x100000 - 0x10ffff) - */ -struct sdb_component { - uint64_t addr_first; /* 0x08..0x0f */ - uint64_t addr_last; /* 0x10..0x17 */ - struct sdb_product product; /* 0x18..0x3f */ -}; - -/* Type of the SDB record */ -enum sdb_record_type { - sdb_type_interconnect = 0x00, - sdb_type_device = 0x01, - sdb_type_bridge = 0x02, - sdb_type_integration = 0x80, - sdb_type_repo_url = 0x81, - sdb_type_synthesis = 0x82, - sdb_type_empty = 0xFF, -}; - -/* Type 0: interconnect (first of the array) - * - * sdb_records is the length of the table including this first - * record, version is 1. The bus type is enumerated later. 
- */ -#define SDB_MAGIC 0x5344422d /* "SDB-" */ -struct sdb_interconnect { - uint32_t sdb_magic; /* 0x00-0x03 */ - uint16_t sdb_records; /* 0x04-0x05 */ - uint8_t sdb_version; /* 0x06 */ - uint8_t sdb_bus_type; /* 0x07 */ - struct sdb_component sdb_component; /* 0x08-0x3f */ -}; - -/* Type 1: device - * - * class is 0 for "custom device", other values are - * to be standardized; ABI version is for the driver, - * bus-specific bits are defined by each bus (see below) - */ -struct sdb_device { - uint16_t abi_class; /* 0x00-0x01 */ - uint8_t abi_ver_major; /* 0x02 */ - uint8_t abi_ver_minor; /* 0x03 */ - uint32_t bus_specific; /* 0x04-0x07 */ - struct sdb_component sdb_component; /* 0x08-0x3f */ -}; - -/* Type 2: bridge - * - * child is the address of the nested SDB table - */ -struct sdb_bridge { - uint64_t sdb_child; /* 0x00-0x07 */ - struct sdb_component sdb_component; /* 0x08-0x3f */ -}; - -/* Type 0x80: integration - * - * all types with bit 7 set are meta-information, so - * software can ignore the types it doesn't know. 
Here we - * just provide product information for an aggregate device - */ -struct sdb_integration { - uint8_t reserved[24]; /* 0x00-0x17 */ - struct sdb_product product; /* 0x08-0x3f */ -}; - -/* Type 0x81: Top module repository url - * - * again, an informative field that software can ignore - */ -struct sdb_repo_url { - uint8_t repo_url[63]; /* 0x00-0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* Type 0x82: Synthesis tool information - * - * this informative record - */ -struct sdb_synthesis { - uint8_t syn_name[16]; /* 0x00-0x0f */ - uint8_t commit_id[16]; /* 0x10-0x1f */ - uint8_t tool_name[8]; /* 0x20-0x27 */ - uint32_t tool_version; /* 0x28-0x2b */ - uint32_t date; /* 0x2c-0x2f */ - uint8_t user_name[15]; /* 0x30-0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* Type 0xff: empty - * - * this allows keeping empty slots during development, - * so they can be filled later with minimal efforts and - * no misleading description is ever shipped -- hopefully. - * It can also be used to pad a table to a desired length. 
- */ -struct sdb_empty { - uint8_t reserved[63]; /* 0x00-0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* The type of bus, for bus-specific flags */ -enum sdb_bus_type { - sdb_wishbone = 0x00, - sdb_data = 0x01, -}; - -#define SDB_WB_WIDTH_MASK 0x0f -#define SDB_WB_ACCESS8 0x01 -#define SDB_WB_ACCESS16 0x02 -#define SDB_WB_ACCESS32 0x04 -#define SDB_WB_ACCESS64 0x08 -#define SDB_WB_LITTLE_ENDIAN 0x80 - -#define SDB_DATA_READ 0x04 -#define SDB_DATA_WRITE 0x02 -#define SDB_DATA_EXEC 0x01 - -#endif /* __SDB_H__ */ -- cgit v1.2.3 From 353050be4c19e102178ccc05988101887c25ae53 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Nov 2021 18:48:08 +0000 Subject: bpf: Fix toctou on read-only map's constant scalar tracking Commit a23740ec43ba ("bpf: Track contents of read-only maps as scalars") is checking whether maps are read-only both from BPF program side and user space side, and then, given their content is constant, reading out their data via map->ops->map_direct_value_addr() which is then subsequently used as known scalar value for the register, that is, it is marked as __mark_reg_known() with the read value at verification time. Before a23740ec43ba, the register content was marked as an unknown scalar so the verifier could not make any assumptions about the map content. The current implementation however is prone to a TOCTOU race, meaning, the value read as known scalar for the register is not guaranteed to be exactly the same at a later point when the program is executed, and as such, the prior made assumptions of the verifier with regards to the program will be invalid which can cause issues such as OOB access, etc. While the BPF_F_RDONLY_PROG map flag is always fixed and required to be specified at map creation time, the map->frozen property is initially set to false for the map given the map value needs to be populated, e.g. for global data sections. 
Once complete, the loader "freezes" the map from user space such that no subsequent updates/deletes are possible anymore. For the rest of the lifetime of the map, this freeze one-time trigger cannot be undone anymore after a successful BPF_MAP_FREEZE cmd return. Meaning, any new BPF_* cmd calls which would update/delete map entries will be rejected with -EPERM since map_get_sys_perms() removes the FMODE_CAN_WRITE permission. This also means that pending update/delete map entries must still complete before this guarantee is given. This corner case is not an issue for loaders since they create and prepare such program private map in successive steps. However, a malicious user is able to trigger this TOCTOU race in two different ways: i) via userfaultfd, and ii) via batched updates. For i) userfaultfd is used to expand the competition interval, so that map_update_elem() can modify the contents of the map after map_freeze() and bpf_prog_load() were executed. This works, because userfaultfd halts the parallel thread which triggered a map_update_elem() at the time where we copy key/value from the user buffer and this already passed the FMODE_CAN_WRITE capability test given at that time the map was not "frozen". Then, the main thread performs the map_freeze() and bpf_prog_load(), and once that had completed successfully, the other thread is woken up to complete the pending map_update_elem() which then changes the map content. For ii) the idea of the batched update is similar, meaning, when there are a large number of updates to be processed, it can increase the competition interval between the two. It is therefore possible in practice to modify the contents of the map after executing map_freeze() and bpf_prog_load(). One way to fix both i) and ii) at the same time is to expand the use of the map's map->writecnt. 
The latter was introduced in fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") and further refined in 1f6cb19be2e2 ("bpf: Prevent re-mmap()'ing BPF map as writable for initially r/o mapping") with the rationale to make a writable mmap()'ing of a map mutually exclusive with read-only freezing. The counter indicates writable mmap() mappings and then prevents/fails the freeze operation. Its semantics can be expanded beyond just mmap() by generally indicating ongoing write phases. This would essentially span any parallel regular and batched flavor of update/delete operation and then also have map_freeze() fail with -EBUSY. For the check_mem_access() in the verifier we expand upon the bpf_map_is_rdonly() check ensuring that all last pending writes have completed via bpf_map_write_active() test. Once the map->frozen is set and bpf_map_write_active() indicates a map->writecnt of 0 only then we are really guaranteed to use the map's data as known constants. For map->frozen being set and pending writes in process of still being completed we fall back to marking that register as unknown scalar so we don't end up making assumptions about it. With this, both TOCTOU reproducers from i) and ii) are fixed. Note that the map->writecnt has been converted into an atomic64 in the fix in order to avoid a double freeze_mutex mutex_{un,}lock() pair when updating map->writecnt in the various map update/delete BPF_* cmd flavors. Spanning the freeze_mutex over entire map update/delete operations in syscall side would not be possible due to then causing everything to be serialized. Similarly, something like synchronize_rcu() after setting map->frozen to wait for update/deletes to complete is not possible either since it would also have to span the user copy which can sleep.
On the libbpf side, this won't break d66562fba1ce ("libbpf: Add BPF object skeleton support") as the anonymous mmap()-ed "map initialization image" is remapped as a BPF map-backed mmap()-ed memory where for .rodata it's non-writable. Fixes: a23740ec43ba ("bpf: Track contents of read-only maps as scalars") Reported-by: w1tcher.bupt@gmail.com Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f715e8863f4d..e7a163a3146b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -193,7 +193,7 @@ struct bpf_map { atomic64_t usercnt; struct work_struct work; struct mutex freeze_mutex; - u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */ + atomic64_t writecnt; }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -1419,6 +1419,7 @@ void bpf_map_put(struct bpf_map *map); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); +bool bpf_map_write_active(const struct bpf_map *map); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, -- cgit v1.2.3 From 0f0ac158d28ff78e75c334e869b1cb8e69372a1f Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 24 Oct 2021 16:37:05 +1300 Subject: platform/x86: asus-wmi: Add support for custom fan curves Add support for custom fan curves found on some ASUS ROG laptops. These laptops have the ability to set a custom curve for the CPU and GPU fans via two ACPI methods. This patch adds two pwm attributes to the hwmon sysfs, pwm1 for CPU fan, pwm2 for GPU fan. Both are under the hwmon of the name `asus_custom_fan_curve`. There is no safety check of the set fan curves - this must be done in userspace. 
The fans have settings [1,2,3] under pwm_enable: 1. Enable and write settings out 2. Disable and use factory fan mode 3. Same as 2, additionally restoring default factory curve. Use of 2 means that the curve the user has set is still stored and won't be erased, but the laptop will be using its default auto-fan mode. Re-enabling the manual mode then activates the curves again. Notes: - pwm_enable = 0 is an invalid setting. - pwm is actually a percentage and is scaled on writing to device. Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20211024033705.5595-2-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 17dc5cb6f3f2..a571b47ff362 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -77,6 +77,8 @@ #define ASUS_WMI_DEVID_THERMAL_CTRL 0x00110011 #define ASUS_WMI_DEVID_FAN_CTRL 0x00110012 /* deprecated */ #define ASUS_WMI_DEVID_CPU_FAN_CTRL 0x00110013 +#define ASUS_WMI_DEVID_CPU_FAN_CURVE 0x00110024 +#define ASUS_WMI_DEVID_GPU_FAN_CURVE 0x00110025 /* Power */ #define ASUS_WMI_DEVID_PROCESSOR_STATE 0x00120012 -- cgit v1.2.3 From 38543b72fbe52b7eec0dedd420d80a06c652d8e4 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Thu, 28 Oct 2021 02:22:41 +0200 Subject: platform/surface: aggregator: Make client device removal more generic Currently, there are similar functions defined in the Aggregator Registry and the controller core. Make client device removal more generic and export it. We can then use this function later on to remove client devices from device hubs as well as the controller and avoid re-defining similar things. 
Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20211028002243.1586083-2-luzmaximilian@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/surface_aggregator/device.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h index f636c5310321..cc257097eb05 100644 --- a/include/linux/surface_aggregator/device.h +++ b/include/linux/surface_aggregator/device.h @@ -319,6 +319,15 @@ void ssam_device_driver_unregister(struct ssam_device_driver *d); ssam_device_driver_unregister) +/* -- Helpers for controller and hub devices. ------------------------------- */ + +#ifdef CONFIG_SURFACE_AGGREGATOR_BUS +void ssam_remove_clients(struct device *dev); +#else /* CONFIG_SURFACE_AGGREGATOR_BUS */ +static inline void ssam_remove_clients(struct device *dev) {} +#endif /* CONFIG_SURFACE_AGGREGATOR_BUS */ + + /* -- Helpers for client-device requests. ----------------------------------- */ /** -- cgit v1.2.3 From ebf7f6f0a6cdcc17a3da52b81e4b3a98c4005028 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 5 Nov 2021 09:30:00 +0800 Subject: bpf: Change value of MAX_TAIL_CALL_CNT from 32 to 33 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the current code, the actual max tail call count is 33 which is greater than MAX_TAIL_CALL_CNT (defined as 32). The actual limit is not consistent with the meaning of MAX_TAIL_CALL_CNT and thus confusing at first glance. We can see the historical evolution from commit 04fd61ab36ec ("bpf: allow bpf programs to tail-call other bpf programs") and commit f9dabe016b63 ("bpf: Undo off-by-one in interpreter tail call count limit"). In order to avoid changing existing behavior, the actual limit is 33 now, this is reasonable. After commit 874be05f525e ("bpf, tests: Add tail call test suite"), we can see there exists failed testcase. 
On all archs when CONFIG_BPF_JIT_ALWAYS_ON is not set: # echo 0 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf # dmesg | grep -w FAIL Tail call error path, max count reached jited:0 ret 34 != 33 FAIL On some archs: # echo 1 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf # dmesg | grep -w FAIL Tail call error path, max count reached jited:1 ret 34 != 33 FAIL Although the above failed testcase has been fixed in commit 18935a72eb25 ("bpf/tests: Fix error in tail call limit tests"), it would still be good to change the value of MAX_TAIL_CALL_CNT from 32 to 33 to make the code more readable. The 32-bit x86 JIT was using a limit of 32, just fix the wrong comments and limit to 33 tail calls as the constant MAX_TAIL_CALL_CNT updated. For the mips64 JIT, use "ori" instead of "addiu" as suggested by Johan Almbladh. For the riscv JIT, use RV_REG_TCC directly to save one register move as suggested by Björn Töpel. For the other implementations, no function changes, it does not change the current limit 33, the new value of MAX_TAIL_CALL_CNT can reflect the actual max tail call count, the related tail call testcases in test_bpf module and selftests can work well for the interpreter and the JIT. 
Here are the test results on x86_64: # uname -m x86_64 # echo 0 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf test_suite=test_tail_calls # dmesg | tail -1 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [0/8 JIT'ed] # rmmod test_bpf # echo 1 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf test_suite=test_tail_calls # dmesg | tail -1 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [8/8 JIT'ed] # rmmod test_bpf # ./test_progs -t tailcalls #142 tailcalls:OK Summary: 1/11 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Tested-by: Johan Almbladh Tested-by: Ilya Leoshkevich Acked-by: Björn Töpel Acked-by: Johan Almbladh Acked-by: Ilya Leoshkevich Link: https://lore.kernel.org/bpf/1636075800-3264-1-git-send-email-yangtiezhu@loongson.cn --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 56098c866704..cc7a0c36e7df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1081,7 +1081,7 @@ struct bpf_array { }; #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ -#define MAX_TAIL_CALL_CNT 32 +#define MAX_TAIL_CALL_CNT 33 #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ -- cgit v1.2.3 From 42f67eea3ba36cef2dce2e853de6ddcb2e89eb39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:33 -0800 Subject: net: use sk_is_tcp() in more places Move sk_is_tcp() to include/net/sock.h and use it where we can. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/skmsg.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 584d94be9c8b..18a717fe62eb 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -507,12 +507,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } -static inline bool sk_is_tcp(const struct sock *sk) -{ - return sk->sk_type == SOCK_STREAM && - sk->sk_protocol == IPPROTO_TCP; -} - static inline bool sk_is_udp(const struct sock *sk) { return sk->sk_type == SOCK_DGRAM && -- cgit v1.2.3 From f35f821935d8df76f9c92e2431a225bdff938169 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:46 -0800 Subject: tcp: defer skb freeing after socket lock is released tcp recvmsg() (or rx zerocopy) spends a fair amount of time freeing skbs after their payload has been consumed. A typical ~64KB GRO packet has to release ~45 page references, eventually going to page allocator for each of them. Currently, this freeing is performed while socket lock is held, meaning that there is a high chance that BH handler has to queue incoming packets to tcp socket backlog. This can cause additional latencies, because the user thread has to process the backlog at release_sock() time, and while doing so, additional frames can be added by BH handler. This patch adds logic to defer these frees after socket lock is released, or directly from BH handler if possible. Being able to free these skbs from BH handler helps a lot, because this avoids the usual alloc/free asymmetry, when BH handler and user thread do not run on same cpu or NUMA node.
One cpu can now be fully utilized for the kernel->user copy, and another cpu is handling BH processing and skb/page allocs/frees (assuming RFS is not forcing use of a single CPU) Tested: 100Gbit NIC Max throughput for one TCP_STREAM flow, over 10 runs MTU : 1500 Before: 55 Gbit After: 66 Gbit MTU : 4096+(headers) Before: 82 Gbit After: 95 Gbit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 686a666d073d..b8b806512e16 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -743,6 +744,7 @@ struct sk_buff { }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; + struct llist_node ll_node; }; union { -- cgit v1.2.3 From 4721031c3559db8eae61df305f10c00099a7c1d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:51 -0800 Subject: net: move gro definitions to include/net/gro.h include/linux/netdevice.h became too big, move gro stuff into include/net/gro.h Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 348 ---------------------------------------------- 1 file changed, 348 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec42495a43a..d95c9839ce90 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2520,109 +2520,6 @@ static inline void netif_napi_del(struct napi_struct *napi) synchronize_net(); } -struct napi_gro_cb { - /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ - void *frag0; - - /* Length of frag0. */ - unsigned int frag0_len; - - /* This indicates where we are processing relative to skb->data. 
*/ - int data_offset; - - /* This is non-zero if the packet cannot be merged with the new skb. */ - u16 flush; - - /* Save the IP ID here and check when we get to the transport layer */ - u16 flush_id; - - /* Number of segments aggregated. */ - u16 count; - - /* Start offset for remote checksum offload */ - u16 gro_remcsum_start; - - /* jiffies when first packet was created/queued */ - unsigned long age; - - /* Used in ipv6_gro_receive() and foo-over-udp */ - u16 proto; - - /* This is non-zero if the packet may be of the same flow. */ - u8 same_flow:1; - - /* Used in tunnel GRO receive */ - u8 encap_mark:1; - - /* GRO checksum is valid */ - u8 csum_valid:1; - - /* Number of checksums via CHECKSUM_UNNECESSARY */ - u8 csum_cnt:3; - - /* Free the skb? */ - u8 free:2; -#define NAPI_GRO_FREE 1 -#define NAPI_GRO_FREE_STOLEN_HEAD 2 - - /* Used in foo-over-udp, set in udp[46]_gro_receive */ - u8 is_ipv6:1; - - /* Used in GRE, set in fou/gue_gro_receive */ - u8 is_fou:1; - - /* Used to determine if flush_id can be ignored */ - u8 is_atomic:1; - - /* Number of gro_receive callbacks this packet already went through */ - u8 recursion_counter:4; - - /* GRO is done by frag_list pointer chaining. 
*/ - u8 is_flist:1; - - /* used to support CHECKSUM_COMPLETE for tunneling protocols */ - __wsum csum; - - /* used in skb_gro_receive() slow path */ - struct sk_buff *last; -}; - -#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) - -#define GRO_RECURSION_LIMIT 15 -static inline int gro_recursion_inc_test(struct sk_buff *skb) -{ - return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; -} - -typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); -static inline struct sk_buff *call_gro_receive(gro_receive_t cb, - struct list_head *head, - struct sk_buff *skb) -{ - if (unlikely(gro_recursion_inc_test(skb))) { - NAPI_GRO_CB(skb)->flush |= 1; - return NULL; - } - - return cb(head, skb); -} - -typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, - struct sk_buff *); -static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, - struct sock *sk, - struct list_head *head, - struct sk_buff *skb) -{ - if (unlikely(gro_recursion_inc_test(skb))) { - NAPI_GRO_CB(skb)->flush |= 1; - return NULL; - } - - return cb(sk, head, skb); -} - struct packet_type { __be16 type; /* This is really htons(ether_type). 
*/ bool ignore_outgoing; @@ -3008,251 +2905,6 @@ int dev_restart(struct net_device *dev); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); -static inline unsigned int skb_gro_offset(const struct sk_buff *skb) -{ - return NAPI_GRO_CB(skb)->data_offset; -} - -static inline unsigned int skb_gro_len(const struct sk_buff *skb) -{ - return skb->len - NAPI_GRO_CB(skb)->data_offset; -} - -static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) -{ - NAPI_GRO_CB(skb)->data_offset += len; -} - -static inline void *skb_gro_header_fast(struct sk_buff *skb, - unsigned int offset) -{ - return NAPI_GRO_CB(skb)->frag0 + offset; -} - -static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen) -{ - return NAPI_GRO_CB(skb)->frag0_len < hlen; -} - -static inline void skb_gro_frag0_invalidate(struct sk_buff *skb) -{ - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; -} - -static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, - unsigned int offset) -{ - if (!pskb_may_pull(skb, hlen)) - return NULL; - - skb_gro_frag0_invalidate(skb); - return skb->data + offset; -} - -static inline void *skb_gro_network_header(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) + - skb_network_offset(skb); -} - -static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) -{ - if (NAPI_GRO_CB(skb)->csum_valid) - NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum, - csum_partial(start, len, 0)); -} - -/* GRO checksum functions. These are logical equivalents of the normal - * checksum functions (in skbuff.h) except that they operate on the GRO - * offsets and fields in sk_buff. 
- */ - -__sum16 __skb_gro_checksum_complete(struct sk_buff *skb); - -static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb)); -} - -static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, - bool zero_okay, - __sum16 check) -{ - return ((skb->ip_summed != CHECKSUM_PARTIAL || - skb_checksum_start_offset(skb) < - skb_gro_offset(skb)) && - !skb_at_gro_remcsum_start(skb) && - NAPI_GRO_CB(skb)->csum_cnt == 0 && - (!zero_okay || check)); -} - -static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, - __wsum psum) -{ - if (NAPI_GRO_CB(skb)->csum_valid && - !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) - return 0; - - NAPI_GRO_CB(skb)->csum = psum; - - return __skb_gro_checksum_complete(skb); -} - -static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) -{ - if (NAPI_GRO_CB(skb)->csum_cnt > 0) { - /* Consume a checksum from CHECKSUM_UNNECESSARY */ - NAPI_GRO_CB(skb)->csum_cnt--; - } else { - /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we - * verified a new top level checksum or an encapsulated one - * during GRO. This saves work if we fallback to normal path. 
- */ - __skb_incr_checksum_unnecessary(skb); - } -} - -#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ - compute_pseudo) \ -({ \ - __sum16 __ret = 0; \ - if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ - __ret = __skb_gro_checksum_validate_complete(skb, \ - compute_pseudo(skb, proto)); \ - if (!__ret) \ - skb_gro_incr_csum_unnecessary(skb); \ - __ret; \ -}) - -#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ - __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) - -#define skb_gro_checksum_validate_zero_check(skb, proto, check, \ - compute_pseudo) \ - __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) - -#define skb_gro_checksum_simple_validate(skb) \ - __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) - -static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->csum_cnt == 0 && - !NAPI_GRO_CB(skb)->csum_valid); -} - -static inline void __skb_gro_checksum_convert(struct sk_buff *skb, - __wsum pseudo) -{ - NAPI_GRO_CB(skb)->csum = ~pseudo; - NAPI_GRO_CB(skb)->csum_valid = 1; -} - -#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \ -do { \ - if (__skb_gro_checksum_convert_check(skb)) \ - __skb_gro_checksum_convert(skb, \ - compute_pseudo(skb, proto)); \ -} while (0) - -struct gro_remcsum { - int offset; - __wsum delta; -}; - -static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) -{ - grc->offset = 0; - grc->delta = 0; -} - -static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, - unsigned int off, size_t hdrlen, - int start, int offset, - struct gro_remcsum *grc, - bool nopartial) -{ - __wsum delta; - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); - - BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); - - if (!nopartial) { - NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; - return ptr; - } - - ptr = skb_gro_header_fast(skb, off); - if 
(skb_gro_header_hard(skb, off + plen)) { - ptr = skb_gro_header_slow(skb, off + plen, off); - if (!ptr) - return NULL; - } - - delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, - start, offset); - - /* Adjust skb->csum since we changed the packet */ - NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); - - grc->offset = off + hdrlen + offset; - grc->delta = delta; - - return ptr; -} - -static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, - struct gro_remcsum *grc) -{ - void *ptr; - size_t plen = grc->offset + sizeof(u16); - - if (!grc->delta) - return; - - ptr = skb_gro_header_fast(skb, grc->offset); - if (skb_gro_header_hard(skb, grc->offset + sizeof(u16))) { - ptr = skb_gro_header_slow(skb, plen, grc->offset); - if (!ptr) - return; - } - - remcsum_unadjust((__sum16 *)ptr, grc->delta); -} - -#ifdef CONFIG_XFRM_OFFLOAD -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) -{ - if (PTR_ERR(pp) != -EINPROGRESS) - NAPI_GRO_CB(skb)->flush |= flush; -} -static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, - struct sk_buff *pp, - int flush, - struct gro_remcsum *grc) -{ - if (PTR_ERR(pp) != -EINPROGRESS) { - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, grc); - skb->remcsum_offload = 0; - } -} -#else -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) -{ - NAPI_GRO_CB(skb)->flush |= flush; -} -static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, - struct sk_buff *pp, - int flush, - struct gro_remcsum *grc) -{ - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, grc); - skb->remcsum_offload = 0; -} -#endif static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, -- cgit v1.2.3 From 0b935d7f8c07bf0a192712bdbf76dbf45ef8b115 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:52 -0800 Subject: net: gro: move skb_gro_receive_list to 
udp_offload.c This helper is used once, no need to keep it in fat net/core/skbuff.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d95c9839ce90..ce6ee1453dbc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2903,7 +2903,6 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); -int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From e456a18a390b96f22b0de2acd4d0f49c72ed2280 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:53 -0800 Subject: net: gro: move skb_gro_receive into net/core/gro.c net/core/gro.c will contain all core gro functions, to shrink net/core/skbuff.c and net/core/dev.c Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce6ee1453dbc..93d397db9ec4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2902,7 +2902,6 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); -int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From 587652bbdd06ab38a4c1b85e40f933d2cf4a1147 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:54 -0800 Subject: net: gro: populate net/core/gro.c Move gro code and data from net/core/dev.c to net/core/gro.c to ease maintenance. gro_normal_list() and gro_normal_one() are inlined because they are called from both files. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93d397db9ec4..31a7e6b27681 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3657,6 +3657,7 @@ int netif_rx_ni(struct sk_buff *skb); int netif_rx_any_context(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); +void netif_receive_skb_list_internal(struct list_head *head); void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); -- cgit v1.2.3 From 77d641baa3c8e18a1056bec6c64c6103c1a17b1e Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Mon, 8 Nov 2021 17:27:05 +0100 Subject: power: supply: core: add POWER_SUPPLY_HEALTH_NO_BATTERY Some chargers can keep the system powered from the mains even when no battery is present. In this case none of the currently defined health statuses applies. Add a new status to report that no battery is present. Suggested-by: Sebastian Reichel Signed-off-by: Luca Ceresoli Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 9ca1f120a211..2d1318fe2455 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -66,6 +66,7 @@ enum { POWER_SUPPLY_HEALTH_WARM, POWER_SUPPLY_HEALTH_COOL, POWER_SUPPLY_HEALTH_HOT, + POWER_SUPPLY_HEALTH_NO_BATTERY, }; enum { -- cgit v1.2.3 From d7751d6476185ff754b9dad2cba0c0a6e43ecadc Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 20 May 2021 17:09:58 +0300 Subject: net/mlx5: E-Switch, Fix resetting of encap mode when entering switchdev E-Switch encap mode is relevant only when in switchdev mode.
The RDMA driver can query the encap configuration via mlx5_eswitch_get_encap_mode(). Make sure it returns the currently used mode and not the set one. This reverts the cited commit which reset the encap mode on entering switchdev and fixes the original issue properly. Fixes: 9a64144d683a ("net/mlx5: E-Switch, Fix default encap mode") Signed-off-by: Paul Blakey Reviewed-by: Mark Bloch Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eswitch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 97afcea39a7b..8b18fe9771f9 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -145,13 +145,13 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, GENMASK(31 - ESW_TUN_ID_BITS - ESW_RESERVED_BITS, \ ESW_TUN_OPTS_OFFSET + 1) -u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); +u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev); u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw); #else /* CONFIG_MLX5_ESWITCH */ -static inline u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev) +static inline u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev) { return MLX5_ESWITCH_NONE; } -- cgit v1.2.3 From c2c60ea37e5b6be58c9dd7aff0b2e86ba0f18e0b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:23:01 -0800 Subject: once: use __section(".data.once") .data.once contains nicely packed bool variables. It is used already by DO_ONCE_LITE(). Using it also in DO_ONCE() removes holes in .data section. 
Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/once.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index d361fb14ac3a..f54523052bbc 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -38,7 +38,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key, #define DO_ONCE(func, ...) \ ({ \ bool ___ret = false; \ - static bool ___done = false; \ + static bool __section(".data.once") ___done = false; \ static DEFINE_STATIC_KEY_TRUE(___once_key); \ if (static_branch_unlikely(&___once_key)) { \ unsigned long ___flags; \ -- cgit v1.2.3 From 7071732c26fe2cf141185ed16a8a85d02495ae8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:23:02 -0800 Subject: net: use .data.once section in netdev_level_once() Same rationale as the prior patch: using the dedicated section avoids holes and packs all these bool values. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 31a7e6b27681..dd328364dfe9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4942,7 +4942,7 @@ void netdev_info(const struct net_device *dev, const char *format, ...); #define netdev_level_once(level, dev, fmt, ...) \ do { \ - static bool __print_once __read_mostly; \ + static bool __section(".data.once") __print_once; \ \ if (!__print_once) { \ __print_once = true; \ -- cgit v1.2.3 From 49ecc2e9c3abd269951972fa8b23a4d081111b80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:23:03 -0800 Subject: net: align static siphash keys siphash keys use 16 bytes. Define siphash_aligned_key_t macro so that we can make sure they are not crossing a cache line boundary.
Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/siphash.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/siphash.h b/include/linux/siphash.h index bf21591a9e5e..3f7427b9e935 100644 --- a/include/linux/siphash.h +++ b/include/linux/siphash.h @@ -21,6 +21,8 @@ typedef struct { u64 key[2]; } siphash_key_t; +#define siphash_aligned_key_t siphash_key_t __aligned(16) + static inline bool siphash_key_is_zero(const siphash_key_t *key) { return !(key->key[0] | key->key[1]); -- cgit v1.2.3 From b9241f54138ca5af4d3c5ca6db56be83d7491508 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 15 Nov 2021 17:11:17 +0000 Subject: net: document SMII and correct phylink's new validation mechanism SMII has not been documented in the kernel, but information on this PHY interface mode has been recently found. Document it, and correct the recently introduced phylink handling for this interface mode. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/E1mmfVl-0075nP-14@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 96e43fbb2dd8..1e57cdd95da3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -99,7 +99,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RTBI: Reduced TBI - * @PHY_INTERFACE_MODE_SMII: ??? 
MII + * @PHY_INTERFACE_MODE_SMII: Serial MII * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface * @PHY_INTERFACE_MODE_XLGMII:40 gigabit media-independent interface * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax -- cgit v1.2.3 From 5854d4a6cc356ba3e16d8593ac1c089a32d1759c Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 29 Oct 2021 20:26:12 +0300 Subject: mtd: spi-nor: Get rid of nor->page_size nor->page_size duplicated what nor->params->page_size indicates for no good reason. page_size is a flash parameter of fixed value and it is better suited to be found in nor->params->page_size. Signed-off-by: Tudor Ambarus Reviewed-by: Pratyush Yadav Reviewed-by: Michael Walle Link: https://lore.kernel.org/r/20211029172633.886453-5-tudor.ambarus@microchip.com --- include/linux/mtd/spi-nor.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index f67457748ed8..fc90fce26e33 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -371,7 +371,6 @@ struct spi_nor_flash_parameter; * @bouncebuf_size: size of the bounce buffer * @info: SPI NOR part JEDEC MFR ID and other info * @manufacturer: SPI NOR manufacturer - * @page_size: the page size of the SPI NOR * @addr_width: number of address bytes * @erase_opcode: the opcode for erasing a sector * @read_opcode: the read opcode @@ -401,7 +400,6 @@ struct spi_nor { size_t bouncebuf_size; const struct flash_info *info; const struct spi_nor_manufacturer *manufacturer; - u32 page_size; u8 addr_width; u8 erase_opcode; u8 read_opcode; -- cgit v1.2.3 From 1e2c3ef0496e72ba9001da5fd1b7ed56ccb30597 Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Mon, 4 Oct 2021 16:11:52 +0200 Subject: tee: export teedev_open() and teedev_close_context() Exports the two functions teedev_open() and teedev_close_context() in order to make it easier to create a driver internal struct tee_context. 
Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- include/linux/tee_drv.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index a1f03461369b..468a7d83dc6c 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -587,4 +587,18 @@ struct tee_client_driver { #define to_tee_client_driver(d) \ container_of(d, struct tee_client_driver, driver) +/** + * teedev_open() - Open a struct tee_device + * @teedev: Device to open + * + * @return a pointer to struct tee_context on success or an ERR_PTR on failure. + */ +struct tee_context *teedev_open(struct tee_device *teedev); + +/** + * teedev_close_context() - closes a struct tee_context + * @ctx: The struct tee_context to close + */ +void teedev_close_context(struct tee_context *ctx); + #endif /*__TEE_DRV_H*/ -- cgit v1.2.3 From 49c39ec4670a8f045729e3717af2e1a74caf89a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 13 Sep 2021 14:21:25 +0200 Subject: dma-buf: nuke dma_resv_get_excl_unlocked MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Heureka, that's finally not used any more. Signed-off-by: Christian König Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210917123513.1106-27-christian.koenig@amd.com --- include/linux/dma-resv.h | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h index 09c6063b199a..eebf04325b34 100644 --- a/include/linux/dma-resv.h +++ b/include/linux/dma-resv.h @@ -440,32 +440,6 @@ dma_resv_excl_fence(struct dma_resv *obj) return rcu_dereference_check(obj->fence_excl, dma_resv_held(obj)); } -/** - * dma_resv_get_excl_unlocked - get the reservation object's - * exclusive fence, without lock held. 
- * @obj: the reservation object - * - * If there is an exclusive fence, this atomically increments it's - * reference count and returns it. - * - * RETURNS - * The exclusive fence or NULL if none - */ -static inline struct dma_fence * -dma_resv_get_excl_unlocked(struct dma_resv *obj) -{ - struct dma_fence *fence; - - if (!rcu_access_pointer(obj->fence_excl)) - return NULL; - - rcu_read_lock(); - fence = dma_fence_get_rcu_safe(&obj->fence_excl); - rcu_read_unlock(); - - return fence; -} - /** * dma_resv_shared_list - get the reservation object's shared fence list * @obj: the reservation object -- cgit v1.2.3 From 2fb75e1b642f49253d8848c9e47e8942f5366221 Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Mon, 25 Oct 2021 11:46:26 +0800 Subject: psi: Add a missing SPDX license header Add the missing SPDX license header to include/linux/psi.h include/linux/psi_types.h kernel/sched/psi.c Signed-off-by: Liu Xinpeng Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/1635133586-84611-2-git-send-email-liuxp11@chinatelecom.cn --- include/linux/psi.h | 1 + include/linux/psi_types.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psi.h b/include/linux/psi.h index 65eb1476ac70..a70ca833c6d7 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PSI_H #define _LINUX_PSI_H diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 0a23300d49af..bf50068d5d4b 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PSI_TYPES_H #define _LINUX_PSI_TYPES_H -- cgit v1.2.3 From 4feee7d12603deca8775f9f9ae5e121093837444 Mon Sep 17 00:00:00 2001 From: Josh Don Date: Mon, 18 Oct 2021 13:34:28 -0700 Subject: sched/core: Forced idle accounting Adds accounting for "forced idle" time, which is time where a cookie'd task forces its SMT 
sibling to idle, despite the presence of runnable tasks. Forced idle time is one means to measure the cost of enabling core scheduling (ie. the capacity lost due to the need to force idle). Forced idle time is attributed to the thread responsible for causing the forced idle. A few details: - Forced idle time is displayed via /proc/PID/sched. It also requires that schedstats is enabled. - Forced idle is only accounted when a sibling hyperthread is held idle despite the presence of runnable tasks. No time is charged if a sibling is idle but has no runnable tasks. - Tasks with 0 cookie are never charged forced idle. - For SMT > 2, we scale the amount of forced idle charged based on the number of forced idle siblings. Additionally, we split the time up and evenly charge it to all running tasks, as each is equally responsible for the forced idle. Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211018203428.2025792-1-joshdon@google.com --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 78c351e35fec..d2e261adb8ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -523,7 +523,11 @@ struct sched_statistics { u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; + +#ifdef CONFIG_SCHED_CORE + u64 core_forceidle_sum; #endif +#endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { -- cgit v1.2.3 From cb0e52b7748737b2cf6481fdd9b920ce7e1ebbdf Mon Sep 17 00:00:00 2001 From: Brian Chen Date: Wed, 10 Nov 2021 21:33:12 +0000 Subject: psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim We've noticed cases where tasks in a cgroup are stalled on memory but there is little memory FULL pressure since tasks stay on the runqueue in reclaim. A simple example involves a single threaded program that keeps leaking and touching large amounts of memory. 
It runs in a cgroup with swap enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though there is significant CPU pressure and memory SOME, there is barely any memory FULL since the task enters reclaim and stays on the runqueue. However, this memory-bound task is effectively stalled on memory and we expect memory FULL to match memory SOME in this scenario. The code is confused about memstall && running, thinking there is a stalled task and a productive task when there's only one task: a reclaimer that's counted as both. To fix this, we redefine the condition for PSI_MEM_FULL to check that all running tasks are in an active memstall instead of checking that there are no running tasks. case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); + return unlikely(tasks[NR_MEMSTALL] && + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); This will capture reclaimers. It will also capture tasks that called psi_memstall_enter() and are about to sleep, but this should be negligible noise. Signed-off-by: Brian Chen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com --- include/linux/psi_types.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index bf50068d5d4b..516c0fe836fd 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -22,7 +22,17 @@ enum psi_task_count { * don't have to special case any state tracking for it. */ NR_ONCPU, - NR_PSI_TASK_COUNTS = 4, + /* + * For IO and CPU stalls the presence of running/oncpu tasks + * in the domain means a partial rather than a full stall. + * For memory it's not so simple because of page reclaimers: + * they are running/oncpu while representing a stall. To tell + * whether a domain has productivity left or not, we need to + * distinguish between regular running (i.e. 
productive) + * threads and memstall ones. + */ + NR_MEMSTALL_RUNNING, + NR_PSI_TASK_COUNTS = 5, }; /* Task state bitmasks */ @@ -30,6 +40,7 @@ enum psi_task_count { #define TSK_MEMSTALL (1 << NR_MEMSTALL) #define TSK_RUNNING (1 << NR_RUNNING) #define TSK_ONCPU (1 << NR_ONCPU) +#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING) /* Resources that workloads could be stalled on */ enum psi_res { -- cgit v1.2.3 From ff083a2d972f56bebfd82409ca62e5dfce950961 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:22 +0000 Subject: perf: Protect perf_guest_cbs with RCU Protect perf_guest_cbs with RCU to fix multiple possible errors. Luckily, all paths that read perf_guest_cbs already require RCU protection, e.g. to protect the callback chains, so only the direct perf_guest_cbs touchpoints need to be modified. Bug #1 is a simple lack of WRITE_ONCE/READ_ONCE behavior to ensure perf_guest_cbs isn't reloaded between a !NULL check and a dereference. Fixed via the READ_ONCE() in rcu_dereference(). Bug #2 is that on weakly-ordered architectures, updates to the callbacks themselves are not guaranteed to be visible before the pointer is made visible to readers. Fixed by the smp_store_release() in rcu_assign_pointer() when the new pointer is non-NULL. Bug #3 is that, because the callbacks are global, it's possible for readers to run in parallel with an unregisters, and thus a module implementing the callbacks can be unloaded while readers are in flight, resulting in a use-after-free. Fixed by a synchronize_rcu() call when unregistering callbacks. Bug #1 escaped notice because it's extremely unlikely a compiler will reload perf_guest_cbs in this sequence. perf_guest_cbs does get reloaded for future derefs, e.g. for ->is_user_mode(), but the ->is_in_guest() guard all but guarantees the consumer will win the race, e.g. 
to nullify perf_guest_cbs, KVM has to completely exit the guest and tear down all VMs before KVM starts its module unload / unregister sequence. This also makes it all but impossible to encounter bug #3. Bug #2 has not been a problem because all architectures that register callbacks are strongly ordered and/or have a static set of callbacks. But with help, unloading kvm_intel can trigger bug #1 e.g. wrapping perf_guest_cbs with READ_ONCE in perf_misc_flags() while spamming kvm_intel module load/unload leads to: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 6 PID: 1825 Comm: stress Not tainted 5.14.0-rc2+ #459 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:perf_misc_flags+0x1c/0x70 Call Trace: perf_prepare_sample+0x53/0x6b0 perf_event_output_forward+0x67/0x160 __perf_event_overflow+0x52/0xf0 handle_pmi_common+0x207/0x300 intel_pmu_handle_irq+0xcf/0x410 perf_event_nmi_handler+0x28/0x50 nmi_handle+0xc7/0x260 default_do_nmi+0x6b/0x170 exc_nmi+0x103/0x130 asm_exc_nmi+0x76/0xbf Fixes: 39447b386c84 ("perf: Enhance perf to allow for guest statistic collection from host") Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211111020738.2512932-2-seanjc@google.com --- include/linux/perf_event.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0dcfd265beed..318c489b735b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1240,7 +1240,18 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); -extern struct perf_guest_info_callbacks *perf_guest_cbs; +extern struct perf_guest_info_callbacks __rcu
*perf_guest_cbs; +static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) +{ + /* + * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading + * the callbacks between a !NULL check and dereferences, to ensure + * pending stores/changes to the callback pointers are visible before a + * non-NULL perf_guest_cbs is visible to readers, and to prevent a + * module from unloading callbacks while readers are active. + */ + return rcu_dereference(perf_guest_cbs); +} extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); -- cgit v1.2.3 From 2934e3d09350c1a7ca2433fbeabfcd831e48a575 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:25 +0000 Subject: perf: Stop pretending that perf can handle multiple guest callbacks Drop the 'int' return value from the perf (un)register callbacks helpers and stop pretending perf can support multiple callbacks. The 'int' returns are not future proofing anything as none of the callers take action on an error. It's also not obvious that there will ever be co-tenant hypervisors, and if there are, that allowing multiple callbacks to be registered is desirable or even correct. Opportunistically rename callbacks=>cbs in the affected declarations to match their definitions. No functional change intended. 
Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-5-seanjc@google.com --- include/linux/perf_event.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 318c489b735b..98c204488496 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1252,8 +1252,8 @@ static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) */ return rcu_dereference(perf_guest_cbs); } -extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); -extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); +extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); +extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); @@ -1497,10 +1497,10 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } -static inline int perf_register_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } -static inline int perf_unregister_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } +static inline void perf_register_guest_info_callbacks +(struct perf_guest_info_callbacks *cbs) { } +static inline void perf_unregister_guest_info_callbacks +(struct perf_guest_info_callbacks *cbs) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } -- cgit v1.2.3 From b9f5621c9547dd787900f005a9e1c3d5712de512 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Thu, 11 Nov 2021 02:07:27 +0000 Subject: perf/core: Rework guest callbacks to prepare for static_call support To prepare for using 
static_calls to optimize perf's guest callbacks, replace ->is_in_guest and ->is_user_mode with a new multiplexed hook ->state, tweak ->handle_intel_pt_intr to play nice with being called when there is no active guest, and drop "guest" from ->get_guest_ip. Return '0' from ->state and ->handle_intel_pt_intr to indicate "not in guest" so that DEFINE_STATIC_CALL_RET0 can be used to define the static calls, i.e. no callback == !guest. [sean: extracted from static_call patch, fixed get_ip() bug, wrote changelog] Suggested-by: Peter Zijlstra (Intel) Originally-by: Peter Zijlstra (Intel) Signed-off-by: Like Xu Signed-off-by: Zhu Lingshan Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Boris Ostrovsky Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-7-seanjc@google.com --- include/linux/perf_event.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 98c204488496..5e6b346d62a7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -26,11 +26,13 @@ # include #endif +#define PERF_GUEST_ACTIVE 0x01 +#define PERF_GUEST_USER 0x02 + struct perf_guest_info_callbacks { - int (*is_in_guest)(void); - int (*is_user_mode)(void); - unsigned long (*get_guest_ip)(void); - void (*handle_intel_pt_intr)(void); + unsigned int (*state)(void); + unsigned long (*get_ip)(void); + unsigned int (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT -- cgit v1.2.3 From 1c3430516b0732d923de9fd3bfb3e2e537eeb235 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:28 +0000 Subject: perf: Add wrappers for invoking guest callbacks Add helpers for the guest callbacks to prepare for burying the callbacks behind a Kconfig (it's a lot easier to provide a few stubs than to #ifdef piles of code), and also to prepare for converting the callbacks to static_call(). 
perf_instruction_pointer() in particular will have subtle semantics with static_call(), as the "no callbacks" case will return 0 if the callbacks are unregistered between querying guest state and getting the IP. Implement the change now to avoid a functional change when adding static_call() support, and because the new helper needs to return _something_ in this case. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-8-seanjc@google.com --- include/linux/perf_event.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5e6b346d62a7..346d5aff5804 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1254,6 +1254,30 @@ static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) */ return rcu_dereference(perf_guest_cbs); } +static inline unsigned int perf_guest_state(void) +{ + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + return guest_cbs ? guest_cbs->state() : 0; +} +static inline unsigned long perf_guest_get_ip(void) +{ + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + /* + * Arbitrarily return '0' in the unlikely scenario that the callbacks + * are unregistered between checking guest state and getting the IP. + */ + return guest_cbs ? 
guest_cbs->get_ip() : 0; +} +static inline unsigned int perf_guest_handle_intel_pt_intr(void) +{ + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + if (guest_cbs && guest_cbs->handle_intel_pt_intr) + return guest_cbs->handle_intel_pt_intr(); + return 0; +} extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); -- cgit v1.2.3 From 2aef6f306b39bbe74e2287d6e2ee07c4867d87d0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:29 +0000 Subject: perf: Force architectures to opt-in to guest callbacks Introduce GUEST_PERF_EVENTS and require architectures to select it to allow registering and using guest callbacks in perf. This will hopefully make it more difficult for new architectures to add useless "support" for guest callbacks, e.g. via copy+paste. Stubbing out the helpers has the happy bonus of avoiding a load of perf_guest_cbs when GUEST_PERF_EVENTS=n on arm64/x86. 
Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-9-seanjc@google.com --- include/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 346d5aff5804..ea47ef616ee0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1242,6 +1242,7 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); +#ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) { @@ -1280,6 +1281,11 @@ static inline unsigned int perf_guest_handle_intel_pt_intr(void) } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); +#else +static inline unsigned int perf_guest_state(void) { return 0; } +static inline unsigned long perf_guest_get_ip(void) { return 0; } +static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } +#endif /* CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); -- cgit v1.2.3 From 87b940a0675e25261f022ac3e53e0dfff9cdb995 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:30 +0000 Subject: perf/core: Use static_call to optimize perf_guest_info_callbacks Use static_call to optimize perf's guest callbacks on arm64 and x86, which are now the only architectures that define the callbacks. Use DEFINE_STATIC_CALL_RET0 as the default/NULL for all guest callbacks, as the callback semantics are that a return value '0' means "not in guest". static_call obviously avoids the overhead of CONFIG_RETPOLINE=y, but is also advantageous versus other solutions, e.g. 
per-cpu callbacks, in that a per-cpu memory load is not needed to detect the !guest case. Based on code from Peter and Like. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-10-seanjc@google.com --- include/linux/perf_event.h | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ea47ef616ee0..0ac7d867ca0c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1244,40 +1244,22 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, #ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; -static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) -{ - /* - * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading - * the callbacks between a !NULL check and dereferences, to ensure - * pending stores/changes to the callback pointers are visible before a - * non-NULL perf_guest_cbs is visible to readers, and to prevent a - * module from unloading callbacks while readers are active. - */ - return rcu_dereference(perf_guest_cbs); -} + +DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); +DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); +DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); + static inline unsigned int perf_guest_state(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - return guest_cbs ? 
guest_cbs->state() : 0; + return static_call(__perf_guest_state)(); } static inline unsigned long perf_guest_get_ip(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - /* - * Arbitrarily return '0' in the unlikely scenario that the callbacks - * are unregistered between checking guest state and getting the IP. - */ - return guest_cbs ? guest_cbs->get_ip() : 0; + return static_call(__perf_guest_get_ip)(); } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->handle_intel_pt_intr) - return guest_cbs->handle_intel_pt_intr(); - return 0; + return static_call(__perf_guest_handle_intel_pt_intr)(); } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); -- cgit v1.2.3 From e1bfc24577cc65c95dc519d7621a9c985b97e567 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:33 +0000 Subject: KVM: Move x86's perf guest info callbacks to generic KVM Move x86's perf guest callbacks into common KVM, as they are semantically identical to arm64's callbacks (the only other such KVM callbacks). arm64 will convert to the common versions in a future patch. Implement the necessary arm64 arch hooks now to avoid having to provide stubs or a temporary #define (from x86) to avoid arm64 compilation errors when CONFIG_GUEST_PERF_EVENTS=y. 
Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20211111020738.2512932-13-seanjc@google.com --- include/linux/kvm_host.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9e0667e3723e..9df7ab2d7530 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1170,6 +1170,16 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) } #endif +#ifdef CONFIG_GUEST_PERF_EVENTS +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); + +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); +void kvm_unregister_perf_callbacks(void); +#else +static inline void kvm_register_perf_callbacks(void *ign) {} +static inline void kvm_unregister_perf_callbacks(void) {} +#endif /* CONFIG_GUEST_PERF_EVENTS */ + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); -- cgit v1.2.3 From a9f4a6e92b3b319296fb078da2615f618f6cd80c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:38 +0000 Subject: perf: Drop guest callback (un)register stubs Drop perf's stubs for (un)registering guest callbacks now that KVM registration of callbacks is hidden behind GUEST_PERF_EVENTS=y. The only other user is x86 XEN_PV, and x86 unconditionally selects PERF_EVENTS. No functional change intended. 
Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-18-seanjc@google.com --- include/linux/perf_event.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0ac7d867ca0c..7b7525e9155f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1511,11 +1511,6 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } -static inline void perf_register_guest_info_callbacks -(struct perf_guest_info_callbacks *cbs) { } -static inline void perf_unregister_guest_info_callbacks -(struct perf_guest_info_callbacks *cbs) { } - static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); -- cgit v1.2.3 From f45b2974cc0ae959a4c503a071e38a56bd64372f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 17 Nov 2021 13:57:08 +0100 Subject: bpf, x86: Fix "no previous prototype" warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The arch_prepare_bpf_dispatcher function does not have a prototype, and yields the following warning when W=1 is enabled for the kernel build. >> arch/x86/net/bpf_jit_comp.c:2188:5: warning: no previous \ prototype for 'arch_prepare_bpf_dispatcher' [-Wmissing-prototypes] 2188 | int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, \ int num_funcs) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ Remove the warning by adding a function declaration to include/linux/bpf.h. 
Fixes: 75ccbef6369e ("bpf: Introduce BPF dispatcher") Reported-by: kernel test robot Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211117125708.769168-1-bjorn@kernel.org --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e7a163a3146b..84ff6ef49462 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -732,6 +732,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); +int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ -- cgit v1.2.3 From cf9acc90c80ecbee00334aa85d92f4e74014bcff Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Tue, 16 Nov 2021 17:42:42 +0000 Subject: net: virtio_net_hdr_to_skb: count transport header in UFO virtio_net_hdr_to_skb does not set the skb's gso_size and gso_type correctly for UFO packets received via virtio-net that are a little over the GSO size. This can lead to problems elsewhere in the networking stack, e.g. ovs_vport_send dropping over-sized packets if gso_size is not set. This is due to the comparison if (skb->len - p_off > gso_size) not properly accounting for the transport layer header. p_off includes the size of the transport layer header (thlen), so skb->len - p_off is the size of the TCP/UDP payload. gso_size is read from the virtio-net header. For UFO, fragmentation happens at the IP level so does not need to include the UDP header. Hence the calculation could be comparing a TCP/UDP payload length with an IP payload length, causing legitimate virtio-net packets to lack gso_type/gso_size information.
Example: a UDP packet with payload size 1473 has IP payload size 1481. If the guest used UFO, it is not fragmented and the virtio-net header's flags indicate that it is a GSO frame (VIRTIO_NET_HDR_GSO_UDP), with gso_size = 1480 for an MTU of 1500. skb->len will be 1515 and p_off will be 42, so skb->len - p_off = 1473. Hence the comparison fails, and shinfo->gso_size and gso_type are not set as they should be. Instead, add the UDP header length before comparing to gso_size when using UFO. In this way, it is the size of the IP payload that is compared to gso_size. Fixes: 6dd912f82680 ("net: check untrusted gso_size at kernel entry") Signed-off-by: Jonathan Davies Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index b465f8f3e554..04e87f4b9417 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -120,10 +120,15 @@ retry: if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); + unsigned int nh_off = p_off; struct skb_shared_info *shinfo = skb_shinfo(skb); + /* UFO may not include transport header in gso_size. */ + if (gso_type & SKB_GSO_UDP) + nh_off -= thlen; + /* Too small packets are not really GSO ones. */ - if (skb->len - p_off > gso_size) { + if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; shinfo->gso_type = gso_type; -- cgit v1.2.3 From 8160fb43d55d26d64607fd32fe69185a5f5fe41f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:21 -0800 Subject: net: use an atomic_long_t for queue->trans_timeout tx_timeout_show() assumed dev_watchdog() would stop all the queues, to fetch queue->trans_timeout under protection of the queue->_xmit_lock. As we want to no longer disrupt transmits, we use an atomic_long_t instead. 
Signed-off-by: Eric Dumazet Cc: david decotigny Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd328364dfe9..1d22483cf78c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -592,7 +592,7 @@ struct netdev_queue { * Number of TX timeouts for this queue * (/sys/class/net/DEV/Q/trans_timeout) */ - unsigned long trans_timeout; + atomic_long_t trans_timeout; /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; -- cgit v1.2.3 From 5337824f4dc4bb26f38fbbba4ffb425a92803f15 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:22 -0800 Subject: net: annotate accesses to queue->trans_start In following patches, dev_watchdog() will no longer stop all queues. It will read queue->trans_start locklessly. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1d22483cf78c..279409ef2b18 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4095,10 +4095,21 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) spin_unlock_bh(&txq->_xmit_lock); } +/* + * txq->trans_start can be read locklessly from dev_watchdog() + */ static inline void txq_trans_update(struct netdev_queue *txq) { if (txq->xmit_lock_owner != -1) - txq->trans_start = jiffies; + WRITE_ONCE(txq->trans_start, jiffies); +} + +static inline void txq_trans_cond_update(struct netdev_queue *txq) +{ + unsigned long now = jiffies; + + if (READ_ONCE(txq->trans_start) != now) + WRITE_ONCE(txq->trans_start, now); } /* legacy drivers only, netdev_start_xmit() sets txq->trans_start */ @@ -4106,8 +4117,7 @@ static inline void netif_trans_update(struct 
net_device *dev) { struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); - if (txq->trans_start != jiffies) - txq->trans_start = jiffies; + txq_trans_cond_update(txq); } /** -- cgit v1.2.3 From dab8fe320726b38a6b1dc6a7ca6e386c5f7779e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:23 -0800 Subject: net: do not inline netif_tx_lock()/netif_tx_unlock() These are not fast path, there is no point in inlining them. Also provide netif_freeze_queues()/netif_unfreeze_queues() so that we can use them from dev_watchdog() in the following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 39 ++------------------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 279409ef2b18..4f4a299e92de 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4126,27 +4126,7 @@ static inline void netif_trans_update(struct net_device *dev) * * Get network device transmit lock */ -static inline void netif_tx_lock(struct net_device *dev) -{ - unsigned int i; - int cpu; - - spin_lock(&dev->tx_global_lock); - cpu = smp_processor_id(); - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - - /* We are the only thread of execution doing a - * freeze, but we have to grab the _xmit_lock in - * order to synchronize with threads which are in - * the ->hard_start_xmit() handler and already - * checked the frozen bit. 
- */ - __netif_tx_lock(txq, cpu); - set_bit(__QUEUE_STATE_FROZEN, &txq->state); - __netif_tx_unlock(txq); - } -} +void netif_tx_lock(struct net_device *dev); static inline void netif_tx_lock_bh(struct net_device *dev) { @@ -4154,22 +4134,7 @@ static inline void netif_tx_lock_bh(struct net_device *dev) netif_tx_lock(dev); } -static inline void netif_tx_unlock(struct net_device *dev) -{ - unsigned int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - - /* No need to grab the _xmit_lock here. If the - * queue is not stopped for another reason, we - * force a schedule. - */ - clear_bit(__QUEUE_STATE_FROZEN, &txq->state); - netif_schedule_queue(txq); - } - spin_unlock(&dev->tx_global_lock); -} +void netif_tx_unlock(struct net_device *dev); static inline void netif_tx_unlock_bh(struct net_device *dev) { -- cgit v1.2.3 From 1881eadb2041889d74d60c074eb04189c4a07dad Mon Sep 17 00:00:00 2001 From: Abhyuday Godhasara Date: Mon, 25 Oct 2021 21:25:20 -0700 Subject: firmware: xilinx: add register notifier in zynqmp firmware In zynqmp-firmware, register notifier is not supported, add support of register notifier in zynqmp-firmware. 
Acked-by: Michal Simek Signed-off-by: Tejas Patel Signed-off-by: Abhyuday Godhasara Link: https://lore.kernel.org/r/20211026042525.26612-2-abhyuday.godhasara@xilinx.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 47fd4e52a423..d30d39dc8cb4 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -2,7 +2,7 @@ /* * Xilinx Zynq MPSoC Firmware layer * - * Copyright (C) 2014-2019 Xilinx + * Copyright (C) 2014-2021 Xilinx * * Michal Simek * Davorin Mista @@ -66,6 +66,7 @@ enum pm_api_id { PM_GET_API_VERSION = 1, + PM_REGISTER_NOTIFIER = 5, PM_SYSTEM_SHUTDOWN = 12, PM_REQUEST_NODE = 13, PM_RELEASE_NODE = 14, @@ -427,6 +428,8 @@ int zynqmp_pm_pinctrl_get_config(const u32 pin, const u32 param, int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param, u32 value); int zynqmp_pm_load_pdi(const u32 src, const u64 address); +int zynqmp_pm_register_notifier(const u32 node, const u32 event, + const u32 wake, const u32 enable); #else static inline int zynqmp_pm_get_api_version(u32 *version) { @@ -658,6 +661,12 @@ static inline int zynqmp_pm_load_pdi(const u32 src, const u64 address) { return -ENODEV; } + +static inline int zynqmp_pm_register_notifier(const u32 node, const u32 event, + const u32 wake, const u32 enable) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From fbce9f14055e547d270046f61758c29c957e675d Mon Sep 17 00:00:00 2001 From: Abhyuday Godhasara Date: Mon, 25 Oct 2021 21:25:21 -0700 Subject: firmware: xilinx: add macros of node ids for error event Add macros for the Node-Id of Error events. Move supported api callback ids from zynqmp-power to zynqmp-firmware. 
Acked-by: Michal Simek Signed-off-by: Rajan Vaja Signed-off-by: Abhyuday Godhasara Link: https://lore.kernel.org/r/20211026042525.26612-3-abhyuday.godhasara@xilinx.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index d30d39dc8cb4..b0a38091db71 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -64,6 +64,20 @@ #define XILINX_ZYNQMP_PM_FPGA_FULL 0x0U #define XILINX_ZYNQMP_PM_FPGA_PARTIAL BIT(0) +/* + * Node IDs for the Error Events. + */ +#define EVENT_ERROR_PMC_ERR1 (0x28100000U) +#define EVENT_ERROR_PMC_ERR2 (0x28104000U) +#define EVENT_ERROR_PSM_ERR1 (0x28108000U) +#define EVENT_ERROR_PSM_ERR2 (0x2810C000U) + +enum pm_api_cb_id { + PM_INIT_SUSPEND_CB = 30, + PM_ACKNOWLEDGE_CB = 31, + PM_NOTIFY_CB = 32, +}; + enum pm_api_id { PM_GET_API_VERSION = 1, PM_REGISTER_NOTIFIER = 5, -- cgit v1.2.3 From f4d77525679e289d4976ca03b620ac4cc5403205 Mon Sep 17 00:00:00 2001 From: Abhyuday Godhasara Date: Mon, 25 Oct 2021 21:25:22 -0700 Subject: firmware: xilinx: export the feature check of zynqmp firmware Export zynqmp_pm_feature() so that it can be used by others to get the API version available in firmware.
Acked-by: Michal Simek Signed-off-by: Rajan Vaja Signed-off-by: Abhyuday Godhasara Link: https://lore.kernel.org/r/20211026042525.26612-4-abhyuday.godhasara@xilinx.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index b0a38091db71..077e894bb340 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -444,6 +444,7 @@ int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param, int zynqmp_pm_load_pdi(const u32 src, const u64 address); int zynqmp_pm_register_notifier(const u32 node, const u32 event, const u32 wake, const u32 enable); +int zynqmp_pm_feature(const u32 api_id); #else static inline int zynqmp_pm_get_api_version(u32 *version) { @@ -681,6 +682,11 @@ static inline int zynqmp_pm_register_notifier(const u32 node, const u32 event, { return -ENODEV; } + +static inline int zynqmp_pm_feature(const u32 api_id) +{ + return -ENODEV; +} #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 522a0032af005502507f5f81ae64fdcc82b5d068 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 6 Nov 2021 17:13:35 -0400 Subject: Add linux/cacheflush.h Many architectures do not include asm-generic/cacheflush.h, so turn the includes on their head and add linux/cacheflush.h which includes asm/cacheflush.h. Move the flush_dcache_folio() declaration from asm-generic/cacheflush.h to linux/cacheflush.h and change linux/highmem.h to include linux/cacheflush.h instead of asm/cacheflush.h so that all necessary places will see flush_dcache_folio(). More functions should have their default implementations moved in the future, but those are for follow-on patches. This fixes csky, sparc and sparc64 which were missed in the commit which added flush_dcache_folio(). 
Fixes: 08b0b0059bf1 ("mm: Add flush_dcache_folio()") Suggested-by: Christoph Hellwig Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Geert Uytterhoeven --- include/linux/cacheflush.h | 18 ++++++++++++++++++ include/linux/highmem.h | 3 +-- 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 include/linux/cacheflush.h (limited to 'include/linux') diff --git a/include/linux/cacheflush.h b/include/linux/cacheflush.h new file mode 100644 index 000000000000..fef8b607f97e --- /dev/null +++ b/include/linux/cacheflush.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CACHEFLUSH_H +#define _LINUX_CACHEFLUSH_H + +#include + +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +void flush_dcache_folio(struct folio *folio); +#endif +#else +static inline void flush_dcache_folio(struct folio *folio) +{ +} +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO 0 +#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ + +#endif /* _LINUX_CACHEFLUSH_H */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 25aff0f2ed0b..c944b3b70ee7 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -5,12 +5,11 @@ #include #include #include +#include #include #include #include -#include - #include "highmem-internal.h" /** -- cgit v1.2.3 From 9c3252152e8a6401c2b9e32490a5a16ec4472778 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Nov 2021 21:17:14 -0500 Subject: mm: Rename folio_test_multi to folio_test_large This is a better name. Also add kernel-doc. 
Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/page-flags.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 52ec4b5e5615..05510118fbb8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -692,7 +692,13 @@ static inline bool folio_test_single(struct folio *folio) return !folio_test_head(folio); } -static inline bool folio_test_multi(struct folio *folio) +/** + * folio_test_large() - Does this folio contain more than one page? + * @folio: The folio to test. + * + * Return: True if the folio is larger than one page. + */ +static inline bool folio_test_large(struct folio *folio) { return folio_test_head(folio); } -- cgit v1.2.3 From a1efe484dd8c04c4c2d4eb1ee6b04d01cfc07ccc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Nov 2021 21:18:52 -0500 Subject: mm: Remove folio_test_single There's no need for this predicate; callers can just use !folio_test_large(). Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/page-flags.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 05510118fbb8..b5f14d581113 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -686,12 +686,6 @@ static inline bool test_set_page_writeback(struct page *page) __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) -/* Whether there are one or multiple pages in a folio */ -static inline bool folio_test_single(struct folio *folio) -{ - return !folio_test_head(folio); -} - /** * folio_test_large() - Does this folio contain more than one page? * @folio: The folio to test. 
-- cgit v1.2.3 From ff36da69bc90d80b0c73f47f4b2e270b3ff6da99 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 29 Aug 2021 06:07:03 -0400 Subject: fs: Remove FS_THP_SUPPORT Instead of setting a bit in the fs_flags to set a bit in the address_space, set the bit in the address_space directly. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- include/linux/fs.h | 1 - include/linux/pagemap.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1cb616fc1105..bbf812ce89a8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2518,7 +2518,6 @@ struct file_system_type { #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ -#define FS_THP_SUPPORT 8192 /* Remove once all fs converted */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 1a0c646eb6ff..9e33878bf23b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -176,6 +176,22 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } +/** + * mapping_set_large_folios() - Indicate the file supports large folios. + * @mapping: The file. + * + * The filesystem should call this function in its inode constructor to + * indicate that the VFS can use large folios to cache the contents of + * the file. + * + * Context: This should not be called while the inode is active as it + * is non-atomic. 
+ */ +static inline void mapping_set_large_folios(struct address_space *mapping) +{ + __set_bit(AS_THP_SUPPORT, &mapping->flags); +} + static inline bool mapping_thp_support(struct address_space *mapping) { return test_bit(AS_THP_SUPPORT, &mapping->flags); -- cgit v1.2.3 From ed2145c474c9015bc634e35f6d1a9b7767f3fbfc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 29 Aug 2021 06:28:19 -0400 Subject: fs: Rename AS_THP_SUPPORT and mapping_thp_support These are now indicators of large folio support, not THP support. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/pagemap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 9e33878bf23b..605246452305 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -84,7 +84,7 @@ enum mapping_flags { AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, - AS_THP_SUPPORT = 6, /* THPs supported */ + AS_LARGE_FOLIO_SUPPORT = 6, }; /** @@ -189,12 +189,12 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) */ static inline void mapping_set_large_folios(struct address_space *mapping) { - __set_bit(AS_THP_SUPPORT, &mapping->flags); + __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } -static inline bool mapping_thp_support(struct address_space *mapping) +static inline bool mapping_large_folio_support(struct address_space *mapping) { - return test_bit(AS_THP_SUPPORT, &mapping->flags); + return test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } static inline int filemap_nr_thps(struct address_space *mapping) @@ -209,7 +209,7 @@ static inline int filemap_nr_thps(struct address_space *mapping) static inline void filemap_nr_thps_inc(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS - if (!mapping_thp_support(mapping)) + if (!mapping_large_folio_support(mapping)) 
atomic_inc(&mapping->nr_thps); #else WARN_ON_ONCE(1); @@ -219,7 +219,7 @@ static inline void filemap_nr_thps_inc(struct address_space *mapping) static inline void filemap_nr_thps_dec(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS - if (!mapping_thp_support(mapping)) + if (!mapping_large_folio_support(mapping)) atomic_dec(&mapping->nr_thps); #else WARN_ON_ONCE(1); -- cgit v1.2.3 From 5768d8906bc23d512b1a736c1e198aa833a6daa4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 15 Nov 2021 13:47:13 -0600 Subject: signal: Requeue signals in the appropriate queue In the event that a tracer changes which signal needs to be delivered and that signal is currently blocked then the signal needs to be requeued for later delivery. With the advent of CLONE_THREAD the kernel has 2 signal queues per task: the per process queue and the per task queue. Update the code so that if the signal is removed from the per process queue it is requeued on the per process queue. This is necessary to make it appear the signal was never dequeued. The rr debugger reasonably believes that the state of the process from the last ptrace_stop it observed until PTRACE_EVENT_EXIT can be recreated by simply letting a process run. If a SIGKILL interrupts a ptrace_stop this is not true today. So return signals to their original queue in ptrace_signal so that signals that are not delivered appear like they were never dequeued. Fixes: 794aa320b79d ("[PATCH] sigfix-2.5.40-D6") History Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/87zgq4d5r4.fsf_-_@email.froward.int.ebiederm.org Signed-off-by: "Eric W.
Biederman" --- include/linux/sched/signal.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 23505394ef70..167995d471da 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -286,17 +286,18 @@ static inline int signal_group_exit(const struct signal_struct *sig) extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); -extern int dequeue_signal(struct task_struct *task, - sigset_t *mask, kernel_siginfo_t *info); +extern int dequeue_signal(struct task_struct *task, sigset_t *mask, + kernel_siginfo_t *info, enum pid_type *type); static inline int kernel_dequeue_signal(void) { struct task_struct *task = current; kernel_siginfo_t __info; + enum pid_type __type; int ret; spin_lock_irq(&task->sighand->siglock); - ret = dequeue_signal(task, &task->blocked, &__info); + ret = dequeue_signal(task, &task->blocked, &__info, &__type); spin_unlock_irq(&task->sighand->siglock); return ret; -- cgit v1.2.3 From e0dbd7b0ed021fb9250f7ba4d759325678efefb5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 16 Nov 2021 23:44:28 +0100 Subject: power: supply: core: Add kerneldoc to battery struct This complements the struct power_supply_battery_info with extensive kerneldoc explaining the different semantics of the fields, including an overview of the CC/CV charging concepts implicit in some of the struct members. This is done to first establish semantics before I can add more charging methods by breaking out the CC/CV parameters to its own struct. 
Tested-by: Randy Dunlap Acked-by: Randy Dunlap Reviewed-by: Matti Vaittinen Signed-off-by: Linus Walleij Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 215 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 192 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 2d1318fe2455..f6e94eae4f28 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -343,37 +343,206 @@ struct power_supply_resistance_temp_table { #define POWER_SUPPLY_OCV_TEMP_MAX 20 -/* +/** + * struct power_supply_battery_info - information about batteries + * @technology: from the POWER_SUPPLY_TECHNOLOGY_* enum + * @energy_full_design_uwh: energy content when fully charged in microwatt + * hours + * @charge_full_design_uah: charge content when fully charged in microampere + * hours + * @voltage_min_design_uv: minimum voltage across the poles when the battery + * is at minimum voltage level in microvolts. If the voltage drops below this + * level the battery will need precharging when using CC/CV charging. + * @voltage_max_design_uv: voltage across the poles when the battery is fully + * charged in microvolts. This is the "nominal voltage" i.e. the voltage + * printed on the label of the battery. + * @tricklecharge_current_ua: the tricklecharge current used when trickle + * charging the battery in microamperes. This is the charging phase when the + * battery is completely empty and we need to carefully trickle in some + * charge until we reach the precharging voltage. + * @precharge_current_ua: current to use in the precharge phase in microamperes, + * the precharge rate is limited by limiting the current to this value. + * @precharge_voltage_max_uv: the maximum voltage allowed when precharging in + * microvolts. 
When we pass this voltage we will nominally switch over to the + * CC (constant current) charging phase defined by constant_charge_current_ua + * and constant_charge_voltage_max_uv. + * @charge_term_current_ua: when the current in the CV (constant voltage) + * charging phase drops below this value in microamperes the charging will + * terminate completely and not restart until the voltage over the battery + * poles reach charge_restart_voltage_uv unless we use maintenance charging. + * @charge_restart_voltage_uv: when the battery has been fully charged by + * CC/CV charging and charging has been disabled, and the voltage subsequently + * drops below this value in microvolts, the charging will be restarted + * (typically using CV charging). + * @overvoltage_limit_uv: If the voltage exceeds the nominal voltage + * voltage_max_design_uv and we reach this voltage level, all charging must + * stop and emergency procedures take place, such as shutting down the system + * in some cases. + * @constant_charge_current_max_ua: current in microamperes to use in the CC + * (constant current) charging phase. The charging rate is limited + * by this current. This is the main charging phase and as the current is + * constant into the battery the voltage slowly ascends to + * constant_charge_voltage_max_uv. + * @constant_charge_voltage_max_uv: voltage in microvolts signifying the end of + * the CC (constant current) charging phase and the beginning of the CV + * (constant voltage) charging phase. + * @factory_internal_resistance_uohm: the internal resistance of the battery + * at fabrication time, expressed in microohms. This resistance will vary + * depending on the lifetime and charge of the battery, so this is just a + * nominal ballpark figure. + * @ocv_temp: array indicating the open circuit voltage (OCV) capacity + * temperature indices. 
This is an array of temperatures in degrees Celsius + * indicating which capacity table to use for a certain temperature, since + * the capacity for reasons of chemistry will be different at different + * temperatures. Determining capacity is a multivariate problem and the + * temperature is the first variable we determine. + * @temp_ambient_alert_min: the battery will go outside of operating conditions + * when the ambient temperature goes below this temperature in degrees + * Celsius. + * @temp_ambient_alert_max: the battery will go outside of operating conditions + * when the ambient temperature goes above this temperature in degrees + * Celsius. + * @temp_alert_min: the battery should issue an alert if the internal + * temperature goes below this temperature in degrees Celsius. + * @temp_alert_max: the battery should issue an alert if the internal + * temperature goes above this temperature in degrees Celsius. + * @temp_min: the battery will go outside of operating conditions when + * the internal temperature goes below this temperature in degrees Celsius. + * Normally this means the system should shut down. + * @temp_max: the battery will go outside of operating conditions when + * the internal temperature goes above this temperature in degrees Celsius. + * Normally this means the system should shut down. + * @ocv_table: for each entry in ocv_temp there is a corresponding entry in + * ocv_table and a size for each entry in ocv_table_size. These arrays + * determine the capacity in percent in relation to the voltage in microvolts + * at the indexed temperature. + * @ocv_table_size: for each entry in ocv_temp this array is giving the size of + * each entry in the array of capacity arrays in ocv_table. + * @resist_table: this is a table that correlates a battery temperature to the + * expected internal resistance at this temperature. The resistance is given + * as a percentage of factory_internal_resistance_uohm. 
Knowing the + * resistance of the battery is usually necessary for calculating the open + * circuit voltage (OCV) that is then used with the ocv_table to calculate + * the capacity of the battery. The resist_table must be ordered descending + * by temperature: highest temperature with lowest resistance first, lowest + * temperature with highest resistance last. + * @resist_table_size: the number of items in the resist_table. + * * This is the recommended struct to manage static battery parameters, * populated by power_supply_get_battery_info(). Most platform drivers should * use these for consistency. + * * Its field names must correspond to elements in enum power_supply_property. * The default field value is -EINVAL. - * Power supply class itself doesn't use this. + * + * The charging parameters here assume a CC/CV charging scheme. This method + * is most common with Lithium Ion batteries (other methods are possible) and + * looks as follows: + * + * ^ Battery voltage + * | --- overvoltage_limit_uv + * | + * | ................................................... + * | .. constant_charge_voltage_max_uv + * | .. + * | . + * | . + * | . + * | . + * | . + * | .. precharge_voltage_max_uv + * | .. + * |. (trickle charging) + * +------------------------------------------------------------------> time + * + * ^ Current into the battery + * | + * | ............. constant_charge_current_max_ua + * | . . + * | . . + * | . . + * | . . + * | . .. + * | . .... + * | . ..... + * | ... precharge_current_ua ....... charge_term_current_ua + * | . . + * | . . + * |.... tricklecharge_current_ua . + * | . + * +-----------------------------------------------------------------> time + * + * These diagrams are synchronized on time and the voltage and current + * follow each other. + * + * With CC/CV charging commence over time like this for an empty battery: + * + * 1. 
When the battery is completely empty it may need to be charged with + * an especially small current so that electrons just "trickle in", + * this is the tricklecharge_current_ua. + * + * 2. Next a small initial pre-charge current (precharge_current_ua) + * is applied if the voltage is below precharge_voltage_max_uv until we + * reach precharge_voltage_max_uv. CAUTION: in some texts this is referred + * to as "trickle charging" but the use in the Linux kernel is different + * see below! + * + * 3. Then the main charging current is applied, which is called the constant + * current (CC) phase. A current regulator is set up to allow + * constant_charge_current_max_ua of current to flow into the battery. + * The chemical reaction in the battery will make the voltage go up as + * charge goes into the battery. This current is applied until we reach + * the constant_charge_voltage_max_uv voltage. + * + * 4. At this voltage we switch over to the constant voltage (CV) phase. This + * means we allow current to go into the battery, but we keep the voltage + * fixed. This current will continue to charge the battery while keeping + * the voltage the same. A chemical reaction in the battery goes on + * storing energy without affecting the voltage. Over time the current + * will slowly drop and when we reach charge_term_current_ua we will + * end the constant voltage phase. + * + * After this the battery is fully charged, and if we do not support maintenance + * charging, the charging will not restart until power dissipation makes the + * voltage fall so that we reach charge_restart_voltage_uv and at this point + * we restart charging at the appropriate phase, usually this will be inside + * the CV phase. + * + * If we support maintenance charging the voltage is however kept high after + * the CV phase with a very low current. 
This is meant to let the same charge + * go in for usage while the charger is still connected, mainly for + * dissipation for the power consuming entity while connected to the + * charger. + * + * All charging MUST terminate if the overvoltage_limit_uv is ever reached. + * Overcharging Lithium Ion cells can be DANGEROUS and lead to fire or + * explosions. + * + * The power supply class itself doesn't use this struct as of now. */ struct power_supply_battery_info { - unsigned int technology; /* from the enum above */ - int energy_full_design_uwh; /* microWatt-hours */ - int charge_full_design_uah; /* microAmp-hours */ - int voltage_min_design_uv; /* microVolts */ - int voltage_max_design_uv; /* microVolts */ - int tricklecharge_current_ua; /* microAmps */ - int precharge_current_ua; /* microAmps */ - int precharge_voltage_max_uv; /* microVolts */ - int charge_term_current_ua; /* microAmps */ - int charge_restart_voltage_uv; /* microVolts */ - int overvoltage_limit_uv; /* microVolts */ - int constant_charge_current_max_ua; /* microAmps */ - int constant_charge_voltage_max_uv; /* microVolts */ - int factory_internal_resistance_uohm; /* microOhms */ - int ocv_temp[POWER_SUPPLY_OCV_TEMP_MAX];/* celsius */ - int temp_ambient_alert_min; /* celsius */ - int temp_ambient_alert_max; /* celsius */ - int temp_alert_min; /* celsius */ - int temp_alert_max; /* celsius */ - int temp_min; /* celsius */ - int temp_max; /* celsius */ + unsigned int technology; + int energy_full_design_uwh; + int charge_full_design_uah; + int voltage_min_design_uv; + int voltage_max_design_uv; + int tricklecharge_current_ua; + int precharge_current_ua; + int precharge_voltage_max_uv; + int charge_term_current_ua; + int charge_restart_voltage_uv; + int overvoltage_limit_uv; + int constant_charge_current_max_ua; + int constant_charge_voltage_max_uv; + int factory_internal_resistance_uohm; + int ocv_temp[POWER_SUPPLY_OCV_TEMP_MAX]; + int temp_ambient_alert_min; + int temp_ambient_alert_max; + int 
temp_alert_min; + int temp_alert_max; + int temp_min; + int temp_max; struct power_supply_battery_ocv_table *ocv_table[POWER_SUPPLY_OCV_TEMP_MAX]; int ocv_table_size[POWER_SUPPLY_OCV_TEMP_MAX]; struct power_supply_resistance_temp_table *resist_table; -- cgit v1.2.3 From 6bb835f3d00467c9a5e35f4955afa29df96a404e Mon Sep 17 00:00:00 2001 From: Andriy Tryshnivskyy Date: Sun, 24 Oct 2021 12:16:26 +0300 Subject: iio: core: Introduce IIO_VAL_INT_64. Introduce IIO_VAL_INT_64 to read 64-bit value for channel attribute. Val is used as lower 32 bits. Signed-off-by: Andriy Tryshnivskyy Link: https://lore.kernel.org/r/20211024091627.28031-2-andriy.tryshnivskyy@opensynergy.com Signed-off-by: Jonathan Cameron --- include/linux/iio/types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 84b3f8175cc6..a7aa91f3a8dc 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -24,6 +24,7 @@ enum iio_event_info { #define IIO_VAL_INT_PLUS_NANO 3 #define IIO_VAL_INT_PLUS_MICRO_DB 4 #define IIO_VAL_INT_MULTIPLE 5 +#define IIO_VAL_INT_64 6 /* 64-bit data, val is lower 32 bits */ #define IIO_VAL_FRACTIONAL 10 #define IIO_VAL_FRACTIONAL_LOG2 11 #define IIO_VAL_CHAR 12 -- cgit v1.2.3 From 2925748eadc33cba3bded7b69475a1b002b124ac Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 17 Nov 2021 13:22:53 +0000 Subject: firmware: cs_dsp: Add version checks on coefficient loading The firmware coefficient files contain version information that is currently ignored by the cs_dsp code. This information specifies which version of the firmware the coefficient were generated for. Add a check into the code which prints a warning in the case the coefficient and firmware differ in version, in many cases this will be ok but it is not always, so best to let the user know there is a potential issue. 
Co-authored-by: Simon Trimmer Signed-off-by: Simon Trimmer Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20211117132300.1290-3-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 3a54b1afc48f..ce54705e2bec 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -54,12 +54,14 @@ struct cs_dsp_region { * struct cs_dsp_alg_region - Describes a logical algorithm region in DSP address space * @list: List node for internal use * @alg: Algorithm id + * @ver: Expected algorithm version * @type: Memory region type * @base: Address of region */ struct cs_dsp_alg_region { struct list_head list; unsigned int alg; + unsigned int ver; int type; unsigned int base; }; -- cgit v1.2.3 From 14055b5a3a23204c4702ae5d3f2a819ee081ce33 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 17 Nov 2021 13:22:54 +0000 Subject: firmware: cs_dsp: Add pre_run callback The code already has a post_run callback, add a matching pre_run callback to the client_ops that is called before execution is started. This callback provides a convenient place for the client code to set DSP controls or hardware that requires configuration before the DSP core actually starts execution. Note that placing this callback before cs_dsp_coeff_sync_controls is important to ensure that any control values are then correctly synced out to the chip. 
Co-authored-by: Simon Trimmer Signed-off-by: Simon Trimmer Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20211117132300.1290-4-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index ce54705e2bec..0bf849baeaa5 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -187,7 +187,8 @@ struct cs_dsp { * struct cs_dsp_client_ops - client callbacks * @control_add: Called under the pwr_lock when a control is created * @control_remove: Called under the pwr_lock when a control is destroyed - * @post_run: Called under the pwr_lock by cs_dsp_run() + * @pre_run: Called under the pwr_lock by cs_dsp_run() before the core is started + * @post_run: Called under the pwr_lock by cs_dsp_run() after the core is started * @post_stop: Called under the pwr_lock by cs_dsp_stop() * @watchdog_expired: Called when a watchdog expiry is detected * @@ -197,6 +198,7 @@ struct cs_dsp { struct cs_dsp_client_ops { int (*control_add)(struct cs_dsp_coeff_ctl *ctl); void (*control_remove)(struct cs_dsp_coeff_ctl *ctl); + int (*pre_run)(struct cs_dsp *dsp); int (*post_run)(struct cs_dsp *dsp); void (*post_stop)(struct cs_dsp *dsp); void (*watchdog_expired)(struct cs_dsp *dsp); -- cgit v1.2.3 From b329b3d39497a9fdb175d7e4fd77ae7170d5d26c Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 17 Nov 2021 13:22:58 +0000 Subject: firmware: cs_dsp: Clarify some kernel doc comments Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20211117132300.1290-8-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h 
b/include/linux/firmware/cirrus/cs_dsp.h index 0bf849baeaa5..1ad1b173417a 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -76,8 +76,8 @@ struct cs_dsp_alg_region { * @enabled: Flag indicating whether control is enabled * @list: List node for internal use * @cache: Cached value of the control - * @offset: Offset of control within alg_region - * @len: Length of the cached value + * @offset: Offset of control within alg_region in words + * @len: Length of the cached value in bytes * @set: Flag indicating the value has been written by the user * @flags: Bitfield of WMFW_CTL_FLAG_ control flags defined in wmfw.h * @type: One of the WMFW_CTL_TYPE_ control types defined in wmfw.h -- cgit v1.2.3 From f444da38ac924748de696c393327a44c4b8d727e Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 17 Nov 2021 13:22:59 +0000 Subject: firmware: cs_dsp: Add offset to cs_dsp read/write Provide a mechanism to access only part of a control through the cs_dsp interface. 
Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20211117132300.1290-9-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 1ad1b173417a..38b4da3ddfe4 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -232,8 +232,10 @@ void cs_dsp_init_debugfs(struct cs_dsp *dsp, struct dentry *debugfs_root); void cs_dsp_cleanup_debugfs(struct cs_dsp *dsp); int cs_dsp_coeff_write_acked_control(struct cs_dsp_coeff_ctl *ctl, unsigned int event_id); -int cs_dsp_coeff_write_ctrl(struct cs_dsp_coeff_ctl *ctl, const void *buf, size_t len); -int cs_dsp_coeff_read_ctrl(struct cs_dsp_coeff_ctl *ctl, void *buf, size_t len); +int cs_dsp_coeff_write_ctrl(struct cs_dsp_coeff_ctl *ctl, unsigned int off, + const void *buf, size_t len); +int cs_dsp_coeff_read_ctrl(struct cs_dsp_coeff_ctl *ctl, unsigned int off, + void *buf, size_t len); struct cs_dsp_coeff_ctl *cs_dsp_get_ctl(struct cs_dsp *dsp, const char *name, int type, unsigned int alg); -- cgit v1.2.3 From 5c903f64ce97172d63f7591cfa9e37cba58867b2 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 17 Nov 2021 13:23:00 +0000 Subject: firmware: cs_dsp: Allow creation of event controls Some firmwares contain controls intended to convey firmware state back to the host. Whilst more infrastructure will probably be needed for these in time, as a first step allow creation of the controls, so said firmwares aren't completely rejected.
Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20211117132300.1290-10-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/wmfw.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/wmfw.h b/include/linux/firmware/cirrus/wmfw.h index a19bf7c6fc8b..74e5a4f6c13a 100644 --- a/include/linux/firmware/cirrus/wmfw.h +++ b/include/linux/firmware/cirrus/wmfw.h @@ -29,6 +29,7 @@ #define WMFW_CTL_TYPE_ACKED 0x1000 /* acked control */ #define WMFW_CTL_TYPE_HOSTEVENT 0x1001 /* event control */ #define WMFW_CTL_TYPE_HOST_BUFFER 0x1002 /* host buffer pointer */ +#define WMFW_CTL_TYPE_FWEVENT 0x1004 /* firmware event control */ struct wmfw_header { char magic[4]; -- cgit v1.2.3 From 357a18ad230f0867791b788d2b1d6f280f6f6e61 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 15 Nov 2021 16:50:27 +0000 Subject: KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache In commit 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time / preempted status") I removed the only user of these functions because it was basically impossible to use them safely. There are two stages to the GFN->PFN mapping; first through the KVM memslots to a userspace HVA and then through the page tables to translate that HVA to an underlying PFN. Invalidations of the former were being handled correctly, but no attempt was made to use the MMU notifiers to invalidate the cache when the HVA->PFN mapping changed. As a prelude to reinventing the gfn_to_pfn_cache with more usable semantics, rip it out entirely and untangle the implementation of the unsafe kvm_vcpu_map()/kvm_vcpu_unmap() functions from it. All current users of kvm_vcpu_map() also look broken right now, and will be dealt with separately. They broadly fall into two classes: * Those which map, access the data and immediately unmap.
This is mostly gratuitous and could just as well use the existing user HVA, and could probably benefit from a gfn_to_hva_cache as they do so. * Those which keep the mapping around for a longer time, perhaps even using the PFN directly from the guest. These will need to be converted to the new gfn_to_pfn_cache and then kvm_vcpu_map() can be removed too. Signed-off-by: David Woodhouse Message-Id: <20211115165030.7422-8-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 6 +----- include/linux/kvm_types.h | 7 ------- 2 files changed, 1 insertion(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 60a35d9fe259..eb625af4fc5e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -866,7 +866,7 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache); +void kvm_release_pfn(kvm_pfn_t pfn, bool dirty); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); @@ -942,12 +942,8 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); -int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, - struct gfn_to_pfn_cache *cache, bool atomic); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); -int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, - struct gfn_to_pfn_cache *cache, bool dirty, bool atomic); unsigned long kvm_vcpu_gfn_to_hva(struct 
kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 2237abb93ccd..234eab059839 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -53,13 +53,6 @@ struct gfn_to_hva_cache { struct kvm_memory_slot *memslot; }; -struct gfn_to_pfn_cache { - u64 generation; - gfn_t gfn; - kvm_pfn_t pfn; - bool dirty; -}; - #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE /* * Memory caches are used to preallocate memory ahead of various MMU flows, -- cgit v1.2.3 From f915b75bffb7257bd8d26376b8e1cc67771927f8 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 17 Nov 2021 15:56:52 +0800 Subject: page_pool: Revert "page_pool: disable dma mapping support..." This reverts commit d00e60ee54b12de945b8493cf18c1ada9e422514. As reported by Guillaume in [1]: Enabling LPAE always enables CONFIG_ARCH_DMA_ADDR_T_64BIT in 32-bit systems, which breaks the bootup process when an ethernet driver is using page pool with PP_FLAG_DMA_MAP flag. As we were hoping we had no active consumers for such system when we removed the dma mapping support, and LPAE seems like a common feature for 32 bits system, so revert it. 1. https://www.spinics.net/lists/netdev/msg779890.html Fixes: d00e60ee54b1 ("page_pool: disable dma mapping support for 32-bit arch with 64-bit DMA") Signed-off-by: Yunsheng Lin Reported-by: "kernelci.org bot" Tested-by: "kernelci.org bot" Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S.
Miller --- include/linux/mm_types.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb8c6f5f19bc..c3a6e6209600 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -105,7 +105,18 @@ struct page { struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; - atomic_long_t pp_frag_count; + union { + /** + * dma_addr_upper: might require a 64-bit + * value on 32-bit architectures. + */ + unsigned long dma_addr_upper; + /** + * For frag page support, not supported in + * 32-bit architectures with 64-bit DMA. + */ + atomic_long_t pp_frag_count; + }; }; struct { /* slab, slob and slub */ union { -- cgit v1.2.3 From df6160deb3debe6f964c16349f9431157ff67dda Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Nov 2021 17:57:29 -0800 Subject: tcp: add missing htmldocs for skb->ll_node and sk->defer_list Add missing entries to fix these "make htmldocs" warnings. ./include/linux/skbuff.h:953: warning: Function parameter or member 'll_node' not described in 'sk_buff' ./include/net/sock.h:540: warning: Function parameter or member 'defer_list' not described in 'sock' Fixes: f35f821935d8 ("tcp: defer skb freeing after socket lock is released") Signed-off-by: Eric Dumazet Reported-by: Stephen Rothwell Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b8b806512e16..100fd604fbc9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -627,6 +627,7 @@ typedef unsigned char *sk_buff_data_t; * for retransmit timer * @rbnode: RB tree node, alternative to next/prev for netem/tcp * @list: queue head + * @ll_node: anchor in an llist (eg socket defer_list) * @sk: Socket we are owned by * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in * fragmentation management -- cgit v1.2.3 From 6966df483d7b5b218aeb0e13e7e334a8fc3c1744 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 18 Nov 2021 13:49:51 +0200 Subject: regulator: Update protection IRQ helper docs The documentation of IRQ notification helper had still references to first RFC implementation which called BUG() while trying to protect the hardware. Behaviour was improved as calling the BUG() was not a proper solution. Current implementation attempts to call poweroff if handling of potentially damaging error notification fails. Update the documentation to reflect the actual behaviour. Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/0c9cc4bcf20c3da66fd5a85c97ee4288e5727538.1637233864.git.matti.vaittinen@fi.rohmeurope.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index bd7a73db2e66..54cf566616ae 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -499,7 +499,8 @@ struct regulator_irq_data { * best to shut-down regulator(s) or reboot the SOC if error * handling is repeatedly failing. 
If fatal_cnt is given the IRQ * handling is aborted if it fails for fatal_cnt times and die() - * callback (if populated) or BUG() is called to try to prevent + * callback (if populated) is called. If die() is not populated + * poweroff for the system is attempted in order to prevent any * further damage. * @reread_ms: The time which is waited before attempting to re-read status * at the worker if IC reading fails. Immediate re-read is done @@ -516,11 +517,12 @@ struct regulator_irq_data { * @data: Driver private data pointer which will be passed as such to * the renable, map_event and die callbacks in regulator_irq_data. * @die: Protection callback. If IC status reading or recovery actions - * fail fatal_cnt times this callback or BUG() is called. This - * callback should implement a final protection attempt like - * disabling the regulator. If protection succeeded this may - * return 0. If anything else is returned the core assumes final - * protection failed and calls BUG() as a last resort. + * fail fatal_cnt times this callback is called or system is + * powered off. This callback should implement a final protection + * attempt like disabling the regulator. If protection succeeded + * die() may return 0. If anything else is returned the core + * assumes final protection failed and attempts to perform a + * poweroff as a last resort. * @map_event: Driver callback to map IRQ status into regulator devices with * events / errors. NOTE: callback MUST initialize both the * errors and notifs for all rdevs which it signals having -- cgit v1.2.3 From 8b6e88555971eac384b89fb0bd6c72ee4e1e6a6a Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 18 Nov 2021 13:48:47 +0200 Subject: regulator: rohm-regulator: add helper for restricted voltage setting Few ROHM PMICs have regulators where voltage setting can be done only when regulator is disabled. Add helper for those PMICs. 
Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/6f51871e9fea611d133b5dd2560f4a7ee1ede9cd.1637233864.git.matti.vaittinen@fi.rohmeurope.com Signed-off-by: Mark Brown --- include/linux/mfd/rohm-generic.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h index 35b392a0d73a..35c5866f48b7 100644 --- a/include/linux/mfd/rohm-generic.h +++ b/include/linux/mfd/rohm-generic.h @@ -80,6 +80,8 @@ int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dvs, const struct regulator_desc *desc, struct regmap *regmap); +int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev, + unsigned int sel); #else static inline int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dvs, struct device_node *np, @@ -88,6 +90,11 @@ static inline int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dv { return 0; } +static int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev, + unsigned int sel) +{ + return 0; +} #endif #endif -- cgit v1.2.3 From 92b1348277f8893671e5354adde64fe3cf462821 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 18 Nov 2021 13:49:30 +0200 Subject: regulator: Add units to limit documentation The documentation for limits used at protection level setting did not mention the units. Fix the units in documentation to match values passed in from device-tree (uV, uA, Kelvin) to avoid confusion. 
Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/111114aca991e41e49a32f89b74e95285f07c1e3.1637233864.git.matti.vaittinen@fi.rohmeurope.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index bd7a73db2e66..1cb8071fee34 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -101,11 +101,13 @@ enum regulator_detection_severity { * is requested. * @set_over_voltage_protection: Support enabling of and setting limits for over * voltage situation detection. Detection can be configured for same - * severities as over current protection. + * severities as over current protection. Units of uV. * @set_under_voltage_protection: Support enabling of and setting limits for - * under situation detection. + * under voltage situation detection. Detection can be configured for same + * severities as over current protection. Units of uV. * @set_thermal_protection: Support enabling of and setting limits for over - * temperature situation detection. + * temperature situation detection. Detection can be configured for same + * severities as over current protection. Units of degree Kelvin. * * @set_active_discharge: Set active discharge enable/disable of regulators. * -- cgit v1.2.3 From 418e0a3551bbef5b221705b0e5b8412cdc0afd39 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Nov 2021 14:42:24 +0200 Subject: lib/string_helpers: Introduce kasprintf_strarray() We have a few users already that basically want to have array of sequential strings to be allocated and filled. Provide a helper for them (basically adjusted version from gpio-mockup.c).
Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij --- include/linux/string_helpers.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 4ba39e1403b2..f67a94013c87 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -100,6 +100,7 @@ char *kstrdup_quotable(const char *src, gfp_t gfp); char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp); char *kstrdup_quotable_file(struct file *file, gfp_t gfp); +char **kasprintf_strarray(gfp_t gfp, const char *prefix, size_t n); void kfree_strarray(char **array, size_t n); #endif -- cgit v1.2.3 From acdb89b6c87a2d7b5c48a82756e6f5c6f599f60a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Nov 2021 14:42:25 +0200 Subject: lib/string_helpers: Introduce managed variant of kasprintf_strarray() Some of the users want to have easy way to allocate array of strings that will be automatically cleaned when associated device is gone. Introduce managed variant of kasprintf_strarray() for such use cases. 
Signed-off-by: Andy Shevchenko --- include/linux/string_helpers.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index f67a94013c87..7a22921c9db7 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -7,6 +7,7 @@ #include #include +struct device; struct file; struct task_struct; @@ -103,4 +104,6 @@ char *kstrdup_quotable_file(struct file *file, gfp_t gfp); char **kasprintf_strarray(gfp_t gfp, const char *prefix, size_t n); void kfree_strarray(char **array, size_t n); +char **devm_kasprintf_strarray(struct device *dev, const char *prefix, size_t n); + #endif -- cgit v1.2.3 From 57bdeef4716689d9b0e3571034d65cf420f6efcd Mon Sep 17 00:00:00 2001 From: Naveen Naidu Date: Thu, 18 Nov 2021 19:33:11 +0530 Subject: PCI: Add PCI_ERROR_RESPONSE and related definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A config or MMIO read from a PCI device that doesn't exist or doesn't respond causes a PCI error. There's no real data to return to satisfy the CPU read, so most hardware fabricates ~0 data. Add a PCI_ERROR_RESPONSE definition for that and use it where appropriate to make these checks consistent and easier to find. Also add helper definitions PCI_SET_ERROR_RESPONSE() and PCI_POSSIBLE_ERROR() to make the code more readable. 
Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/55563bf4dfc5d3fdc96695373c659d099bf175b1.1637243717.git.naveennaidu479@gmail.com Signed-off-by: Naveen Naidu Signed-off-by: Bjorn Helgaas Reviewed-by: Pali Rohár --- include/linux/pci.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 18a75c8e615c..0ce26850470e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -154,6 +154,15 @@ enum pci_interrupt_pin { /* The number of legacy PCI INTx interrupts */ #define PCI_NUM_INTX 4 +/* + * Reading from a device that doesn't respond typically returns ~0. A + * successful read from a device may also return ~0, so you need additional + * information to reliably identify errors. + */ +#define PCI_ERROR_RESPONSE (~0ULL) +#define PCI_SET_ERROR_RESPONSE(val) (*(val) = ((typeof(*(val))) PCI_ERROR_RESPONSE)) +#define PCI_POSSIBLE_ERROR(val) ((val) == ((typeof(val)) PCI_ERROR_RESPONSE)) + /* * pci_power_t values must match the bits in the Capabilities PME_Support * and Control/Status PowerState fields in the Power Management capability. -- cgit v1.2.3 From c035713998700e8843c7d087f55bce3c54c0e3ec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Nov 2021 10:19:05 -0400 Subject: mm: Add functions to zero portions of a folio These functions are wrappers around zero_user_segments(), which means that zero_user_segments() can now be called for compound pages even when CONFIG_TRANSPARENT_HUGEPAGE is disabled. Use 'xend' as the name of the parameter to indicate that this is an excluded end, not the more usual included end. Excluding the end makes more sense to the callers, but can cause confusion to readers who are more used to seeing included ends. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong --- include/linux/highmem.h | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index c944b3b70ee7..39bb9b47fa9c 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -230,10 +230,10 @@ static inline void tag_clear_highpage(struct page *page) * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. */ -#if defined(CONFIG_HIGHMEM) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +#ifdef CONFIG_HIGHMEM void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2); -#else /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */ +#else static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) @@ -253,7 +253,7 @@ static inline void zero_user_segments(struct page *page, for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } -#endif /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */ +#endif static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) @@ -363,4 +363,42 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +/** + * folio_zero_segments() - Zero two byte ranges in a folio. + * @folio: The folio to write to. + * @start1: The first byte to zero. + * @xend1: One more than the last byte in the first range. + * @start2: The first byte to zero in the second range. + * @xend2: One more than the last byte in the second range. + */ +static inline void folio_zero_segments(struct folio *folio, + size_t start1, size_t xend1, size_t start2, size_t xend2) +{ + zero_user_segments(&folio->page, start1, xend1, start2, xend2); +} + +/** + * folio_zero_segment() - Zero a byte range in a folio. + * @folio: The folio to write to. + * @start: The first byte to zero. 
+ * @xend: One more than the last byte to zero. + */ +static inline void folio_zero_segment(struct folio *folio, + size_t start, size_t xend) +{ + zero_user_segments(&folio->page, start, xend, 0, 0); +} + +/** + * folio_zero_range() - Zero a byte range in a folio. + * @folio: The folio to write to. + * @start: The first byte to zero. + * @length: The number of bytes to zero. + */ +static inline void folio_zero_range(struct folio *folio, + size_t start, size_t length) +{ + zero_user_segments(&folio->page, start, start + length, 0, 0); +} + #endif /* _LINUX_HIGHMEM_H */ -- cgit v1.2.3 From c4f5b30dda01f2f6979a9681142de454991182ee Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 12 Nov 2021 18:44:10 +0000 Subject: reset: Add of_reset_control_get_optional_exclusive() Add optional variant of of_reset_control_get_exclusive(). If the requested reset is not specified in the device tree, this function returns NULL instead of an error. Suggested-by: Philipp Zabel Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20211112184413.4391-2-biju.das.jz@bp.renesas.com Signed-off-by: Philipp Zabel --- include/linux/reset.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reset.h b/include/linux/reset.h index db0e6115a2f6..8a21b5756c3e 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -454,6 +454,26 @@ static inline struct reset_control *of_reset_control_get_exclusive( return __of_reset_control_get(node, id, 0, false, false, true); } +/** + * of_reset_control_get_optional_exclusive - Lookup and obtain an optional exclusive + * reference to a reset controller. + * @node: device to be reset by the controller + * @id: reset line name + * + * Optional variant of of_reset_control_get_exclusive(). If the requested reset + * is not specified in the device tree, this function returns NULL instead of + * an error. 
+ * + * Returns a struct reset_control or IS_ERR() condition containing errno. + * + * Use of id names is optional. + */ +static inline struct reset_control *of_reset_control_get_optional_exclusive( + struct device_node *node, const char *id) +{ + return __of_reset_control_get(node, id, 0, false, true, true); +} + /** * of_reset_control_get_shared - Lookup and obtain a shared reference * to a reset controller. -- cgit v1.2.3 From fcb116bc43c8c37c052530ead79872f8b2615711 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Nov 2021 14:23:21 -0600 Subject: signal: Replace force_fatal_sig with force_exit_sig when in doubt Recently to prevent issues with SECCOMP_RET_KILL and similar signals being changed before they are delivered SA_IMMUTABLE was added. Unfortunately this broke debuggers[1][2] which reasonably expect to be able to trap synchronous SIGTRAP and SIGSEGV even when the target process is not configured to handle those signals. Add force_exit_sig and use it instead of force_fatal_sig where historically the code has directly called do_exit. This has the implementation benefits of going through the signal exit path (including generating core dumps) without the danger of allowing userspace to ignore or change these signals. This avoids userspace regressions as older kernels exited with do_exit which debuggers also can not intercept. In the future it should be possible to improve the quality of implementation of the kernel by changing some of these force_exit_sig calls to force_fatal_sig. That can be done where it matters on a case-by-case basis with careful analysis.
Reported-by: Kyle Huey Reported-by: kernel test robot [1] https://lkml.kernel.org/r/CAP045AoMY4xf8aC_4QU_-j7obuEPYgTcnQQP3Yxk=2X90jtpjw@mail.gmail.com [2] https://lkml.kernel.org/r/20211117150258.GB5403@xsang-OptiPlex-9020 Fixes: 00b06da29cf9 ("signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed") Fixes: a3616a3c0272 ("signal/m68k: Use force_sigsegv(SIGSEGV) in fpsp040_die") Fixes: 83a1f27ad773 ("signal/powerpc: On swapcontext failure force SIGSEGV") Fixes: 9bc508cf0791 ("signal/s390: Use force_sigsegv in default_trap_handler") Fixes: 086ec444f866 ("signal/sparc32: In setup_rt_frame and setup_fram use force_fatal_sig") Fixes: c317d306d550 ("signal/sparc32: Exit with a fatal signal when try_to_clear_window_buffer fails") Fixes: 695dd0d634df ("signal/x86: In emulate_vsyscall force a signal instead of calling do_exit") Fixes: 1fbd60df8a85 ("signal/vm86_32: Properly send SIGSEGV when the vm86 state cannot be saved.") Fixes: 941edc5bf174 ("exit/syscall_user_dispatch: Send ordinary signals on failure") Link: https://lkml.kernel.org/r/871r3dqfv8.fsf_-_@email.froward.int.ebiederm.org Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: "Eric W. 
Biederman" --- include/linux/sched/signal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 23505394ef70..33a50642cf41 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -352,6 +352,7 @@ extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern void force_fatal_sig(int); +extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); -- cgit v1.2.3 From d8466f73010faf71effb21228ae1cbf577dab130 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sat, 16 Oct 2021 14:22:27 +0100 Subject: mtd: rawnand: Export nand_read_page_hwecc_oob_first() Move the function nand_read_page_hwecc_oob_first() (previously nand_davinci_read_page_hwecc_oob_first()) to nand_base.c, and export it as a GPL symbol, so that it can be used by more modules. 
Cc: # v5.2 Fixes: a0ac778eb82c ("mtd: rawnand: ingenic: Add support for the JZ4740") Signed-off-by: Paul Cercueil Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20211016132228.40254-4-paul@crapouillou.net --- include/linux/mtd/rawnand.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index b2f9dd3cbd69..5b88cd51fadb 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1539,6 +1539,8 @@ int nand_read_data_op(struct nand_chip *chip, void *buf, unsigned int len, bool force_8bit, bool check_only); int nand_write_data_op(struct nand_chip *chip, const void *buf, unsigned int len, bool force_8bit); +int nand_read_page_hwecc_oob_first(struct nand_chip *chip, uint8_t *buf, + int oob_required, int page); /* Scan and identify a NAND device */ int nand_scan_with_ids(struct nand_chip *chip, unsigned int max_chips, -- cgit v1.2.3 From adeef3e32146a8d2a73c399dc6f5d76a449131b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Nov 2021 06:21:51 -0800 Subject: net: constify netdev->dev_addr Commit 406f42fa0d3c ("net-next: When a bond have a massive amount of VLANs...") introduced a rbtree for faster Ethernet address look up. We converted all users to make modifications via appropriate helpers, make netdev->dev_addr const. The update helpers need to upcast from the buffer to struct netdev_hw_addr. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4f4a299e92de..2462195784a9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2117,7 +2117,7 @@ struct net_device { * Cache lines mostly used on receive path (including eth_type_trans()) */ /* Interface address info used in eth_type_trans() */ - unsigned char *dev_addr; + const unsigned char *dev_addr; struct netdev_rx_queue *_rx; unsigned int num_rx_queues; @@ -4268,10 +4268,13 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ +void dev_addr_mod(struct net_device *dev, unsigned int offset, + const void *addr, size_t len); + static inline void __dev_addr_set(struct net_device *dev, const void *addr, size_t len) { - memcpy(dev->dev_addr, addr, len); + dev_addr_mod(dev, 0, addr, len); } static inline void dev_addr_set(struct net_device *dev, const u8 *addr) @@ -4279,13 +4282,6 @@ static inline void dev_addr_set(struct net_device *dev, const u8 *addr) __dev_addr_set(dev, addr, dev->addr_len); } -static inline void -dev_addr_mod(struct net_device *dev, unsigned int offset, - const void *addr, size_t len) -{ - memcpy(&dev->dev_addr[offset], addr, len); -} - int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, -- cgit v1.2.3 From d07b26f5bbea9ade34dfd6abea7b3ca056c03cd1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Nov 2021 06:21:53 -0800 Subject: dev_addr: add a modification check netdev->dev_addr should only be modified via helpers, but someone may be casting off the const. Add a runtime check to catch abuses. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2462195784a9..cb7f2661d187 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1942,6 +1942,8 @@ enum netdev_ml_priv_type { * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. * + * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2268,6 +2270,8 @@ struct net_device { /* protected by rtnl_lock */ struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; + + u8 dev_addr_shadow[MAX_ADDR_LEN]; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -4288,6 +4292,7 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); +void dev_addr_check(struct net_device *dev); /* Functions used for unicast addresses handling */ int dev_uc_add(struct net_device *dev, const unsigned char *addr); -- cgit v1.2.3 From 85b6d24646e4125c591639841169baa98a2da503 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 19 Nov 2021 16:43:21 -0800 Subject: shm: extend forced shm destroy to support objects from several IPC nses Currently, the exit_shm() function is not designed to work properly when task->sysvshm.shm_clist holds shm objects from different IPC namespaces. This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it leads to use-after-free (reproducer exists). This is an attempt to fix the problem by extending exit_shm mechanism to handle shm's destroy from several IPC ns'es. To achieve that we do several things: 1. add a namespace (non-refcounted) pointer to the struct shmid_kernel 2. 
during new shm object creation (newseg()/shmget syscall) we initialize this pointer by current task IPC ns 3. exit_shm() fully reworked such that it traverses over all shp's in task->sysvshm.shm_clist and gets IPC namespace not from current task as it was before but from shp's object itself, then call shm_destroy(shp, ns). Note: We need to be really careful here, because as it was said before (1), our pointer to IPC ns is non-refcnt'ed. To be on the safe side we use the special helper get_ipc_ns_not_zero() which allows to get IPC ns refcounter only if IPC ns is not in the "state of destruction". Q/A Q: Why can we access shp->ns memory using non-refcounted pointer? A: Because shp object lifetime is always shorter than IPC namespace lifetime, so, if we get shp object from the task->sysvshm.shm_clist while holding task_lock(task) nobody can steal our namespace. Q: Does this patch change semantics of unshare/setns/clone syscalls? A: No. It just fixes a non-covered case when process may leave IPC namespace without getting task->sysvshm.shm_clist list cleaned up. Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity") Co-developed-by: Manfred Spraul Signed-off-by: Manfred Spraul Signed-off-by: Alexander Mikhalitsyn Cc: "Eric W. 
Biederman" Cc: Davidlohr Bueso Cc: Greg KH Cc: Andrei Vagin Cc: Pavel Tikhomirov Cc: Vasily Averin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 15 +++++++++++++++ include/linux/sched/task.h | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 05e22770af51..b75395ec8d52 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -131,6 +131,16 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) return ns; } +static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ + if (ns) { + if (refcount_inc_not_zero(&ns->ns.count)) + return ns; + } + + return NULL; +} + extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, @@ -147,6 +157,11 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) return ns; } +static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ + return ns; +} + static inline void put_ipc_ns(struct ipc_namespace *ns) { } diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ba88a6987400..058d7f371e25 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -158,7 +158,7 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. And ->vfork_done. + * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests both inside and outside of read_lock(&tasklist_lock). 
* It must not be nested with write_lock_irq(&tasklist_lock), -- cgit v1.2.3 From afe041c2d0febd83698b8b0164e6b3b1dfae0b66 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Fri, 19 Nov 2021 16:43:40 -0800 Subject: hugetlb: fix hugetlb cgroup refcounting during mremap When hugetlb_vm_op_open() is called during copy_vma(), we may take the reference to resv_map->css. Later, when clearing the reservation pointer of old_vma after transferring it to new_vma, we forget to drop the reference to resv_map->css. This leads to a reference leak of css. Fix this by adding a check to drop the reservation css reference in clear_vma_resv_huge_pages(). Link: https://lkml.kernel.org/r/20211113154412.91134-1-minhquangbui99@gmail.com Fixes: 550a7d60bd5e35 ("mm, hugepages: add mremap() support for hugepage backed vma") Signed-off-by: Bui Quang Minh Reviewed-by: Mike Kravetz Reviewed-by: Mina Almasry Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index c137396129db..ba025ae27882 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -128,6 +128,13 @@ static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( css_get(resv_map->css); } +static inline void resv_map_put_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ + if (resv_map->css) + css_put(resv_map->css); +} + extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, @@ -211,6 +218,11 @@ static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( { } +static inline void resv_map_put_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ +} + static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long 
nr_pages, struct hugetlb_cgroup **ptr) { -- cgit v1.2.3 From f65b8132092699e4f672111836f3f51c00c354f2 Mon Sep 17 00:00:00 2001 From: Elyes HAOUAS Date: Thu, 28 Oct 2021 23:05:17 +0200 Subject: include/linux/efi.h: Remove unneeded whitespaces before tabs Signed-off-by: Elyes HAOUAS Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index dbd39b20e034..de36fb547602 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -570,8 +570,8 @@ extern struct efi { unsigned long flags; } efi; -#define EFI_RT_SUPPORTED_GET_TIME 0x0001 -#define EFI_RT_SUPPORTED_SET_TIME 0x0002 +#define EFI_RT_SUPPORTED_GET_TIME 0x0001 +#define EFI_RT_SUPPORTED_SET_TIME 0x0002 #define EFI_RT_SUPPORTED_GET_WAKEUP_TIME 0x0004 #define EFI_RT_SUPPORTED_SET_WAKEUP_TIME 0x0008 #define EFI_RT_SUPPORTED_GET_VARIABLE 0x0010 @@ -838,7 +838,7 @@ extern int efi_status_to_err(efi_status_t status); #define EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS 0x0000000000000020 #define EFI_VARIABLE_APPEND_WRITE 0x0000000000000040 -#define EFI_VARIABLE_MASK (EFI_VARIABLE_NON_VOLATILE | \ +#define EFI_VARIABLE_MASK (EFI_VARIABLE_NON_VOLATILE | \ EFI_VARIABLE_BOOTSERVICE_ACCESS | \ EFI_VARIABLE_RUNTIME_ACCESS | \ EFI_VARIABLE_HARDWARE_ERROR_RECORD | \ -- cgit v1.2.3 From 3218910fd5858842a1dd98ce92b602f0878f8210 Mon Sep 17 00:00:00 2001 From: Adrian Larumbe Date: Mon, 1 Nov 2021 18:08:24 +0000 Subject: dmaengine: Add core function and capability check for DMA_MEMCPY_SG This is the old DMA_SG interface that was removed in commit c678fa66341c ("dmaengine: remove DMA_SG as it is dead code in kernel"). It has been renamed to DMA_MEMCPY_SG to better match the MEMSET and MEMSET_SG naming convention. It should only be used for mem2mem copies, either main system memory or CPU-addressable device memory (like video memory on a PCI graphics card). 
Bringing back this interface was prompted by the need to use the Xilinx CDMA device for mem2mem SG transfers. Signed-off-by: Adrian Larumbe Link: https://lore.kernel.org/r/20211101180825.241048-3-adrianml@alumnos.upm.es Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 9000f3ffce8b..554a86665de9 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -50,6 +50,7 @@ enum dma_status { */ enum dma_transaction_type { DMA_MEMCPY, + DMA_MEMCPY_SG, DMA_XOR, DMA_PQ, DMA_XOR_VAL, @@ -891,6 +892,11 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)( struct dma_chan *chan, dma_addr_t dst, dma_addr_t src, size_t len, unsigned long flags); + struct dma_async_tx_descriptor *(*device_prep_dma_memcpy_sg)( + struct dma_chan *chan, + struct scatterlist *dst_sg, unsigned int dst_nents, + struct scatterlist *src_sg, unsigned int src_nents, + unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_xor)( struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src, unsigned int src_cnt, size_t len, unsigned long flags); @@ -1051,6 +1057,20 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy( len, flags); } +static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy_sg( + struct dma_chan *chan, + struct scatterlist *dst_sg, unsigned int dst_nents, + struct scatterlist *src_sg, unsigned int src_nents, + unsigned long flags) +{ + if (!chan || !chan->device || !chan->device->device_prep_dma_memcpy_sg) + return NULL; + + return chan->device->device_prep_dma_memcpy_sg(chan, dst_sg, dst_nents, + src_sg, src_nents, + flags); +} + static inline bool dmaengine_is_metadata_mode_supported(struct dma_chan *chan, enum dma_desc_metadata_mode mode) { -- cgit v1.2.3 From 0b70c256eba8448b072d25c95ee65e59da8970de Mon Sep 17 00:00:00 2001 From: Hao 
Chen Date: Thu, 18 Nov 2021 20:12:42 +0800 Subject: ethtool: add support to set/get rx buf len via ethtool Add support to set rx buf len via ethtool -G parameter and get rx buf len via ethtool -g parameter. Signed-off-by: Hao Chen Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- include/linux/ethtool.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 845a0ffc16ee..0b252b82988b 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -67,6 +67,22 @@ enum { ETH_RSS_HASH_FUNCS_COUNT }; +/** + * struct kernel_ethtool_ringparam - RX/TX ring configuration + * @rx_buf_len: Current length of buffers on the rx ring. + */ +struct kernel_ethtool_ringparam { + u32 rx_buf_len; +}; + +/** + * enum ethtool_supported_ring_param - indicator caps for setting ring params + * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len + */ +enum ethtool_supported_ring_param { + ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), +}; + #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) #define __ETH_RSS_HASH(name) __ETH_RSS_HASH_BIT(ETH_RSS_HASH_##name##_BIT) @@ -432,6 +448,7 @@ struct ethtool_module_power_mode_params { * @cap_link_lanes_supported: indicates if the driver supports lanes * parameter. * @supported_coalesce_params: supported types of interrupt coalescing. + * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. Should only set the * @driver, @version, @fw_version and @bus_info fields. 
If not * implemented, the @driver and @bus_info fields will be filled in @@ -613,6 +630,7 @@ struct ethtool_module_power_mode_params { struct ethtool_ops { u32 cap_link_lanes_supported:1; u32 supported_coalesce_params; + u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); int (*get_regs_len)(struct net_device *); void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); -- cgit v1.2.3 From 7462494408cd3de8b0bc1e79670bf213288501d0 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 18 Nov 2021 20:12:43 +0800 Subject: ethtool: extend ringparam setting/getting API with rx_buf_len Add two new parameters kernel_ringparam and extack for .get_ringparam and .set_ringparam to extend more ring params through netlink. Signed-off-by: Hao Chen Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- include/linux/ethtool.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 0b252b82988b..a26f37a27167 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -656,9 +656,13 @@ struct ethtool_ops { struct kernel_ethtool_coalesce *, struct netlink_ext_ack *); void (*get_ringparam)(struct net_device *, - struct ethtool_ringparam *); + struct ethtool_ringparam *, + struct kernel_ethtool_ringparam *, + struct netlink_ext_ack *); int (*set_ringparam)(struct net_device *, - struct ethtool_ringparam *); + struct ethtool_ringparam *, + struct kernel_ethtool_ringparam *, + struct netlink_ext_ack *); void (*get_pause_stats)(struct net_device *dev, struct ethtool_pause_stats *pause_stats); void (*get_pauseparam)(struct net_device *, -- cgit v1.2.3 From 4b66d2161b8125b6caa6971815e85631cf3cf36f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Nov 2021 07:43:31 -0800 Subject: net: annotate accesses to dev->gso_max_size dev->gso_max_size is written under RTNL protection, or when the device is not yet visible, 
but is read locklessly. Add the READ_ONCE()/WRITE_ONCE() pairs, and use netif_set_gso_max_size() where we can to better document what is going on. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cb7f2661d187..14eeb58ed197 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4731,7 +4731,8 @@ static inline bool netif_needs_gso(struct sk_buff *skb, static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { - dev->gso_max_size = size; + /* dev->gso_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_size, size); } static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, -- cgit v1.2.3 From 6d872df3e3b91532b142de9044e5b4984017a55f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Nov 2021 07:43:32 -0800 Subject: net: annotate accesses to dev->gso_max_segs dev->gso_max_segs is written under RTNL protection, or when the device is not yet visible, but is read locklessly. Add netif_set_gso_max_segs() helper. Add the READ_ONCE()/WRITE_ONCE() pairs, and use netif_set_gso_max_segs() where we can to better document what is going on. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 14eeb58ed197..df049864661d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4735,6 +4735,13 @@ static inline void netif_set_gso_max_size(struct net_device *dev, WRITE_ONCE(dev->gso_max_size, size); } +static inline void netif_set_gso_max_segs(struct net_device *dev, + unsigned int segs) +{ + /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_segs, segs); +} + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) -- cgit v1.2.3 From 291dcae39bc482f7edc91a50d97027327e0cf5f9 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 19 Nov 2021 10:58:09 -0500 Subject: net: phylink: Add helpers for c22 registers without MDIO Some devices expose memory-mapped c22-compliant PHYs. Because these devices do not have an MDIO bus, we cannot use the existing helpers. Refactor the existing helpers to allow supplying the values for c22 registers directly, instead of using MDIO to access them. Only get_state and set_advertisement are converted, since they contain the most complex logic. Because set_advertisement is never actually used outside phylink_mii_c22_pcs_config, move the MDIO-writing part into that function. Because some modes do not need the advertisement register set at all, we use -EINVAL for this purpose. Additionally, a new function phylink_pcs_enable_an is provided to determine whether to enable autonegotiation. Signed-off-by: Sean Anderson Reviewed-by: Russell King (Oracle) Signed-off-by: David S. 
Miller --- include/linux/phylink.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 3563820a1765..01224235df0f 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -527,11 +527,12 @@ void phylink_set_port_modes(unsigned long *bits); void phylink_set_10g_modes(unsigned long *mask); void phylink_helper_basex_speed(struct phylink_link_state *state); +void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, + u16 bmsr, u16 lpa); void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, struct phylink_link_state *state); -int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs, - phy_interface_t interface, - const unsigned long *advertising); +int phylink_mii_c22_pcs_encode_advertisement(phy_interface_t interface, + const unsigned long *advertising); int phylink_mii_c22_pcs_config(struct mdio_device *pcs, unsigned int mode, phy_interface_t interface, const unsigned long *advertising); -- cgit v1.2.3 From c4804670026b93f4ebddda30af89fd737bf93931 Mon Sep 17 00:00:00 2001 From: M Chetan Kumar Date: Sat, 20 Nov 2021 21:51:54 +0530 Subject: net: wwan: common debugfs base dir for wwan device This patch set brings in a common debugfs base directory i.e. /sys/kernel/debugfs/wwan/ in WWAN Subsystem for a WWAN device instance, so that drivers avoid polluting the debugfs root with unrelated directories & possible name collision. Having a common debugfs base directory for WWAN drivers makes it easier for users to match control devices with debugfs entries. WWAN Subsystem creates dentry (/sys/kernel/debugfs/wwan) on module load & removes dentry on module unload. When driver registers a new wwan device, dentry (wwanX) is created for WWAN device instance & on driver unregister dentry is removed. New API is introduced to return the wwan device instance dentry so that driver can create debugfs entries under it. 
Signed-off-by: M Chetan Kumar Reviewed-by: Loic Poulain Signed-off-by: David S. Miller --- include/linux/wwan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 9fac819f92e3..1646aa3e6779 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -171,4 +171,6 @@ int wwan_register_ops(struct device *parent, const struct wwan_ops *ops, void wwan_unregister_ops(struct device *parent); +struct dentry *wwan_get_debugfs_dir(struct device *parent); + #endif /* __WWAN_H */ -- cgit v1.2.3 From fba84957e2e2e201cf4e352efe0c7cac0fbb5d5d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 20 Nov 2021 16:31:48 -0800 Subject: skbuff: Move conditional preprocessor directives out of struct sk_buff In preparation for using the struct_group() macro in struct sk_buff, move the conditional preprocessor directives out of the region of struct sk_buff that will be enclosed by struct_group(). While GCC and Clang are happy with conditional preprocessor directives here, sparse is not, even under -Wno-directive-within-macro[1], as would be seen under a C=1 build: net/core/filter.c: note: in included file (through include/linux/netlink.h, include/linux/sock_diag.h): ./include/linux/skbuff.h:820:1: warning: directive in macro's argument list ./include/linux/skbuff.h:822:1: warning: directive in macro's argument list ./include/linux/skbuff.h:846:1: warning: directive in macro's argument list ./include/linux/skbuff.h:848:1: warning: directive in macro's argument list Additionally remove empty macro argument definitions and usage. "objdump -d" shows no object code differences. [1] https://www.spinics.net/lists/linux-sparse/msg10857.html Signed-off-by: Kees Cook Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 059b6266dcd7..c7889afe0d0d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -795,7 +795,7 @@ struct sk_buff { #else #define CLONED_MASK 1 #endif -#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) +#define CLONED_OFFSET offsetof(struct sk_buff, __cloned_offset) /* private: */ __u8 __cloned_offset[0]; @@ -818,18 +818,10 @@ struct sk_buff { __u32 headers_start[0]; /* public: */ -/* if you move pkt_type around you also must adapt those constants */ -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_TYPE_MAX (7 << 5) -#else -#define PKT_TYPE_MAX 7 -#endif -#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) - /* private: */ __u8 __pkt_type_offset[0]; /* public: */ - __u8 pkt_type:3; + __u8 pkt_type:3; /* see PKT_TYPE_MAX */ __u8 ignore_df:1; __u8 nf_trace:1; __u8 ip_summed:2; @@ -845,16 +837,10 @@ struct sk_buff { __u8 encap_hdr_csum:1; __u8 csum_valid:1; -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_VLAN_PRESENT_BIT 7 -#else -#define PKT_VLAN_PRESENT_BIT 0 -#endif -#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset) /* private: */ __u8 __pkt_vlan_present_offset[0]; /* public: */ - __u8 vlan_present:1; + __u8 vlan_present:1; /* See PKT_VLAN_PRESENT_BIT */ __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 csum_not_inet:1; @@ -953,6 +939,22 @@ struct sk_buff { #endif }; +/* if you move pkt_type around you also must adapt those constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_TYPE_MAX (7 << 5) +#else +#define PKT_TYPE_MAX 7 +#endif +#define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) + +/* if you move pkt_vlan_present around you also must adapt these constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_VLAN_PRESENT_BIT 7 +#else +#define 
PKT_VLAN_PRESENT_BIT 0 +#endif +#define PKT_VLAN_PRESENT_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) + #ifdef __KERNEL__ /* * Handling routines are only of interest to the kernel -- cgit v1.2.3 From 03f61041c17914355dde7261be9ccdc821ddd454 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 20 Nov 2021 16:31:49 -0800 Subject: skbuff: Switch structure bounds to struct_group() In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Replace the existing empty member position markers "headers_start" and "headers_end" with a struct_group(). This will allow memcpy() and sizeof() to more easily reason about sizes, and improve readability. "pahole" shows no size nor member offset changes to struct sk_buff. "objdump -d" shows no object code changes (outside of WARNs affected by source line number changes). Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Reviewed-by: Jason A. Donenfeld # drivers/net/wireguard/* Link: https://lore.kernel.org/lkml/20210728035006.GD35706@embeddedor Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c7889afe0d0d..eba256af64a5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -811,12 +811,10 @@ struct sk_buff { __u8 active_extensions; #endif - /* fields enclosed in headers_start/headers_end are copied + /* Fields enclosed in headers group are copied * using a single memcpy() in __copy_skb_header() */ - /* private: */ - __u32 headers_start[0]; - /* public: */ + struct_group(headers, /* private: */ __u8 __pkt_type_offset[0]; @@ -921,9 +919,7 @@ struct sk_buff { u64 kcov_handle; #endif - /* private: */ - __u32 headers_end[0]; - /* public: */ + ); /* end headers group */ /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; -- cgit v1.2.3 From e523af4ee56090fbdd9cf474752448d35930bcd4 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Mon, 18 Oct 2021 18:16:02 -0500 Subject: net/ice: Add support for enable_iwarp and enable_roce devlink param Allow support for 'enable_iwarp' and 'enable_roce' devlink params to turn on/off iWARP or RoCE protocol support for E800 devices. For example, a user can turn on iWARP functionality with, devlink dev param set pci/0000:07:00.0 name enable_iwarp value true cmode runtime This adds an iWARP auxiliary rdma device, ice.iwarp.<>, under this PF. A user request to enable both iWARP and RoCE under the same PF is rejected since this device does not support both protocols simultaneously on the same port. 
Signed-off-by: Shiraz Saleem Tested-by: Leszek Kaliszczuk Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h index e32f6712aee0..1289593411d3 100644 --- a/include/linux/net/intel/iidc.h +++ b/include/linux/net/intel/iidc.h @@ -26,6 +26,11 @@ enum iidc_reset_type { IIDC_GLOBR, }; +enum iidc_rdma_protocol { + IIDC_RDMA_PROTOCOL_IWARP = BIT(0), + IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), +}; + #define IIDC_MAX_USER_PRIORITY 8 /* Struct to hold per RDMA Qset info */ @@ -70,8 +75,6 @@ int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); -#define IIDC_RDMA_ROCE_NAME "roce" - /* Structure representing auxiliary driver tailored information about the core * PCI dev, each auxiliary driver using the IIDC interface will have an * instance of this struct dedicated to it. -- cgit v1.2.3 From 6326948f940dc3f77066d5cdc44ba6afe67830c0 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 29 Sep 2021 11:01:21 -0400 Subject: lsm: security_task_getsecid_subj() -> security_current_getsecid_subj() The security_task_getsecid_subj() LSM hook invites misuse by allowing callers to specify a task even though the hook is only safe when the current task is referenced. Fix this by removing the task_struct argument to the hook, requiring LSM implementations to use the current task. While we are changing the hook declaration we also rename the function to security_current_getsecid_subj() in an effort to reinforce that the hook captures the subjective credentials of the current task and not an arbitrary task on the system. 
Reviewed-by: Serge Hallyn Reviewed-by: Casey Schaufler Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 3 +-- include/linux/lsm_hooks.h | 8 +++----- include/linux/security.h | 4 ++-- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index df8de62f4710..ae2228f0711d 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -206,8 +206,7 @@ LSM_HOOK(int, 0, task_fix_setgid, struct cred *new, const struct cred * old, LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid) LSM_HOOK(int, 0, task_getpgid, struct task_struct *p) LSM_HOOK(int, 0, task_getsid, struct task_struct *p) -LSM_HOOK(void, LSM_RET_VOID, task_getsecid_subj, - struct task_struct *p, u32 *secid) +LSM_HOOK(void, LSM_RET_VOID, current_getsecid_subj, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj, struct task_struct *p, u32 *secid) LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d45b6f6e27fd..52c1990644b9 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -719,11 +719,9 @@ * @p. * @p contains the task_struct for the process. * Return 0 if permission is granted. - * @task_getsecid_subj: - * Retrieve the subjective security identifier of the task_struct in @p - * and return it in @secid. Special care must be taken to ensure that @p - * is the either the "current" task, or the caller has exclusive access - * to @p. + * @current_getsecid_subj: + * Retrieve the subjective security identifier of the current task and + * return it in @secid. * In case of failure, @secid will be set to zero. 
* @task_getsecid_obj: * Retrieve the objective security identifier of the task_struct in @p diff --git a/include/linux/security.h b/include/linux/security.h index bbf44a466832..bb301963e333 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -418,7 +418,7 @@ int security_task_fix_setgid(struct cred *new, const struct cred *old, int security_task_setpgid(struct task_struct *p, pid_t pgid); int security_task_getpgid(struct task_struct *p); int security_task_getsid(struct task_struct *p); -void security_task_getsecid_subj(struct task_struct *p, u32 *secid); +void security_current_getsecid_subj(u32 *secid); void security_task_getsecid_obj(struct task_struct *p, u32 *secid); int security_task_setnice(struct task_struct *p, int nice); int security_task_setioprio(struct task_struct *p, int ioprio); @@ -1119,7 +1119,7 @@ static inline int security_task_getsid(struct task_struct *p) return 0; } -static inline void security_task_getsecid_subj(struct task_struct *p, u32 *secid) +static inline void security_current_getsecid_subj(u32 *secid) { *secid = 0; } -- cgit v1.2.3 From 91389c390521a02ecfb91270f5b9d7fae4312ae5 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Thu, 18 Nov 2021 21:33:37 -0600 Subject: clk: sunxi-ng: Allow the CCU core to be built as a module Like the individual CCU drivers, it can be beneficial for memory consumption of cross-platform configurations to only load the CCU core on the relevant platform. For example, a generic arm64 kernel sees the following improvement when building the CCU core and drivers as modules: before: text data bss dec hex filename 13882360 5251670 360800 19494830 12977ae vmlinux after: text data bss dec hex filename 13734787 5086442 360800 19182029 124b1cd vmlinux So the result is a 390KB total reduction in kernel image size. The one early clock provider (sun5i) requires the core to be built in. 
Now that loading the MMC driver will trigger loading the CCU core, the MMC timing mode functions do not need a compile-time fallback. Signed-off-by: Samuel Holland Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20211119033338.25486-5-samuel@sholland.org --- include/linux/clk/sunxi-ng.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk/sunxi-ng.h b/include/linux/clk/sunxi-ng.h index 3cd14acde0a1..cf32123b39f5 100644 --- a/include/linux/clk/sunxi-ng.h +++ b/include/linux/clk/sunxi-ng.h @@ -6,22 +6,7 @@ #ifndef _LINUX_CLK_SUNXI_NG_H_ #define _LINUX_CLK_SUNXI_NG_H_ -#include - -#ifdef CONFIG_SUNXI_CCU int sunxi_ccu_set_mmc_timing_mode(struct clk *clk, bool new_mode); int sunxi_ccu_get_mmc_timing_mode(struct clk *clk); -#else -static inline int sunxi_ccu_set_mmc_timing_mode(struct clk *clk, - bool new_mode) -{ - return -ENOTSUPP; -} - -static inline int sunxi_ccu_get_mmc_timing_mode(struct clk *clk) -{ - return -ENOTSUPP; -} -#endif #endif -- cgit v1.2.3 From c214f124161d446b340597e7c968e0a2dc149142 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Nov 2021 19:57:10 +0000 Subject: arch_topology: Introduce thermal pressure update function The thermal pressure is a mechanism which is used for providing information about reduced CPU performance to the scheduler. Usually code has to convert the value from frequency units into capacity units, which are understandable by the scheduler. Create a common conversion code which can be just used via a handy API. Internally, the topology_update_thermal_pressure() operates on frequency in MHz and max CPU frequency is taken from 'freq_factor' (per-cpu). 
Signed-off-by: Lukasz Luba Reviewed-by: Thara Gopinath Signed-off-by: Viresh Kumar --- include/linux/arch_topology.h | 3 +++ include/linux/sched/topology.h | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index b97cea83b25e..ace1e5dcf773 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -59,6 +59,9 @@ static inline unsigned long topology_get_thermal_pressure(int cpu) void topology_set_thermal_pressure(const struct cpumask *cpus, unsigned long th_pressure); +void topology_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_freq); + struct cpu_topology { int thread_id; int core_id; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index c07bfa2d80f2..6e89a8e43aa7 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -273,6 +273,13 @@ void arch_set_thermal_pressure(const struct cpumask *cpus, { } #endif +#ifndef arch_update_thermal_pressure +static __always_inline +void arch_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_frequency) +{ } +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); -- cgit v1.2.3 From 7e97b3dc2556743dd02612c92a8de7026e8d7dc9 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Nov 2021 19:57:14 +0000 Subject: arch_topology: Remove unused topology_set_thermal_pressure() and related There is no need of this function (and related) since code has been converted to use the new arch_update_thermal_pressure() API. The old code can be removed. 
Signed-off-by: Lukasz Luba Signed-off-by: Viresh Kumar --- include/linux/arch_topology.h | 3 --- include/linux/sched/topology.h | 7 ------- 2 files changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index ace1e5dcf773..cce6136b300a 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -56,9 +56,6 @@ static inline unsigned long topology_get_thermal_pressure(int cpu) return per_cpu(thermal_pressure, cpu); } -void topology_set_thermal_pressure(const struct cpumask *cpus, - unsigned long th_pressure); - void topology_update_thermal_pressure(const struct cpumask *cpus, unsigned long capped_freq); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 6e89a8e43aa7..8054641c0a7b 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -266,13 +266,6 @@ unsigned long arch_scale_thermal_pressure(int cpu) } #endif -#ifndef arch_set_thermal_pressure -static __always_inline -void arch_set_thermal_pressure(const struct cpumask *cpus, - unsigned long th_pressure) -{ } -#endif - #ifndef arch_update_thermal_pressure static __always_inline void arch_update_thermal_pressure(const struct cpumask *cpus, -- cgit v1.2.3 From cff6f593251cdf5398dc3c57f7032b8e9dcb633e Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Tue, 23 Nov 2021 12:36:47 +0200 Subject: regulator: rohm-generic: inline stub function The function rohm_regulator_set_voltage_sel_restricted() has a stub implementation. Linux-next testing spotted the following: include/linux/mfd/rohm-generic.h:93:12: error: 'rohm_regulator_set_voltage_sel_restricted' defined but not used Fix this by inlining the stub. 
Fixes: 8b6e88555971 ("regulator: rohm-regulator: add helper for restricted voltage setting") Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/YZzEP3S7U15bTDAI@fedora Signed-off-by: Mark Brown --- include/linux/mfd/rohm-generic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h index 35c5866f48b7..080d60adcd5f 100644 --- a/include/linux/mfd/rohm-generic.h +++ b/include/linux/mfd/rohm-generic.h @@ -90,7 +90,8 @@ static inline int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dv { return 0; } -static int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev, + +static inline int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev, unsigned int sel) { return 0; -- cgit v1.2.3 From 2106efda785b55a8957efed9a52dfa28ee0d7280 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 22 Nov 2021 17:24:47 -0800 Subject: net: remove .ndo_change_proto_down .ndo_change_proto_down was added seemingly to enable out-of-tree implementations. Over 2.5yrs later we still have no real users upstream. Hardwire the generic implementation for now, we can revert once real users materialize. (rocker is a test vehicle, not a user.) We need to drop the optimization on the sysfs side, because unlike ndos priv_flags will be changed at runtime, so we'd need READ_ONCE/WRITE_ONCE everywhere.. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df049864661d..db3bff1ae7fd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1297,11 +1297,6 @@ struct netdev_net_notifier { * TX queue. * int (*ndo_get_iflink)(const struct net_device *dev); * Called to get the iflink value of this device. 
- * void (*ndo_change_proto_down)(struct net_device *dev, - * bool proto_down); - * This function is used to pass protocol port error state information - * to the switch driver. The switch driver can react to the proto_down - * by doing a phys down on the associated switch port. * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); * This function is used to get egress tunnel information for given skb. * This is useful for retrieving outer tunnel header parameters while @@ -1542,8 +1537,6 @@ struct net_device_ops { int queue_index, u32 maxrate); int (*ndo_get_iflink)(const struct net_device *dev); - int (*ndo_change_proto_down)(struct net_device *dev, - bool proto_down); int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, @@ -1612,6 +1605,7 @@ struct net_device_ops { * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) + * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1646,6 +1640,7 @@ enum netdev_priv_flags { IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, IFF_TX_SKB_NO_LINEAR = 1<<31, + IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1982,7 +1977,7 @@ struct net_device { /* Read-mostly cache-line for fast-path access */ unsigned int flags; - unsigned int priv_flags; + unsigned long long priv_flags; const struct net_device_ops *netdev_ops; int ifindex; unsigned short gflags; @@ -3735,7 +3730,6 @@ int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); int dev_change_proto_down(struct net_device *dev, bool proto_down); -int dev_change_proto_down_generic(struct net_device *dev, 
bool proto_down); void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); -- cgit v1.2.3 From 985e9ece1e55a94da842f6c1f9ff84d587b26267 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 17 Nov 2021 20:07:35 +0200 Subject: ACPI: Make acpi_node_get_parent() local acpi_node_get_parent() isn't used outside drivers/acpi/property.c. Make it local. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 668d007f0917..b28f8790192a 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1182,7 +1182,6 @@ int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child); -struct fwnode_handle *acpi_node_get_parent(const struct fwnode_handle *fwnode); struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, @@ -1287,12 +1286,6 @@ acpi_get_next_subnode(const struct fwnode_handle *fwnode, return NULL; } -static inline struct fwnode_handle * -acpi_node_get_parent(const struct fwnode_handle *fwnode) -{ - return NULL; -} - static inline struct fwnode_handle * acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) -- cgit v1.2.3 From c6d5f1933085f9a92ed5c256a859ab31c7a35f88 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Mon, 22 Nov 2021 12:19:31 +0100 Subject: net: stmmac: Calculate CDC error only once The clock domain crossing error (CDC) is calculated at every fetch of Tx or Rx timestamps. It includes a division. Especially on arm32 based systems it is expensive. It also requires two conditionals in the hotpath. 
Add a compensation value cache to struct plat_stmmacenet_data and subtract it unconditionally in the RX/TX functions which spares the conditionals. The value is initialized to 0 and, if supported, calculated in the PTP initialization code. Suggested-by: Thomas Gleixner Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20211122111931.135135-1-kurt@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index a6f03b36fc4f..89b8e208cd7b 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -241,6 +241,7 @@ struct plat_stmmacenet_data { unsigned int clk_ref_rate; unsigned int mult_fact_100ns; s32 ptp_max_adj; + u32 cdc_error_adj; struct reset_control *stmmac_rst; struct reset_control *stmmac_ahb_rst; struct stmmac_axi *axi; -- cgit v1.2.3 From 393c3714081a53795bbff0e985d24146def6f57f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 18 Nov 2021 15:00:08 -0800 Subject: kernfs: switch global kernfs_rwsem lock to per-fs lock The kernfs implementation has coarse lock granularity (kernfs_rwsem), so every kernfs-based filesystem (e.g., sysfs, cgroup) competes for the same lock. This causes trouble in some cases, where tasks wait on the global lock for a long time even though they operate in totally independent contexts. A typical example: process A enters direct reclaim while holding the lock after accessing a file in sysfs, process B is waiting for the lock in exclusive mode, and process C then has to wait for the lock until process B finishes its job after taking the lock from process A. This patch switches the global kernfs_rwsem to a per-fs lock by putting the rwsem into kernfs_root. 
Suggested-by: Tejun Heo Acked-by: Tejun Heo Signed-off-by: Minchan Kim Link: https://lore.kernel.org/r/20211118230008.2679780-1-minchan@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 3ccce6f24548..9f650986a81b 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -16,6 +16,7 @@ #include #include #include +#include struct file; struct dentry; @@ -197,6 +198,7 @@ struct kernfs_root { struct list_head supers; wait_queue_head_t deactivate_waitq; + struct rw_semaphore kernfs_rwsem; }; struct kernfs_open_file { -- cgit v1.2.3 From 1b6ed6bf32fb22ef8e3572fc9c0f6454adf1ca40 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Wed, 24 Nov 2021 09:17:37 +0200 Subject: regulator: Drop unnecessary struct member The irq_flags from the regulator IRQ helper description struct was never used. The IRQ flags are passed as parameters to helper registration instead. Remove the unnecessary struct field. 
Fixes: 7111c6d1b31b ("regulator: IRQ based event/error notification helpers") Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/5f6371e178453fa2b165da50452f7db4e986debb.1637736436.git.matti.vaittinen@fi.rohmeurope.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 1cb8071fee34..53b25cd7ead0 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -554,7 +554,6 @@ struct regulator_irq_data { */ struct regulator_irq_desc { const char *name; - int irq_flags; int fatal_cnt; int reread_ms; int irq_off_ms; -- cgit v1.2.3 From 6fadec4c5561e2fbe1dfa8a7da9bc58d094a8f04 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Wed, 24 Nov 2021 09:16:45 +0200 Subject: regulator: Add regulator_err2notif() helper Help drivers avoid storing both supported notification and supported error flags by supporting conversion from regulator error to notification. This may help saving some bytes. Add helper for finding the regulator notification corresponding to a regulator error. Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/eb1755ac0569ff07ffa466cf8912c6fd50e7c7c6.1637736436.git.matti.vaittinen@fi.rohmeurope.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 53b25cd7ead0..6c6ec9658c30 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -645,6 +645,40 @@ struct regulator_dev { spinlock_t err_lock; }; +/* + * Convert error flags to corresponding notifications. + * + * Can be used by drivers which use the notification helpers to + * find out correct notification flags based on the error flags. 
Drivers + * can avoid storing both supported notification and error flags which + * may save few bytes. + */ +static inline int regulator_err2notif(int err) +{ + switch (err) { + case REGULATOR_ERROR_UNDER_VOLTAGE: + return REGULATOR_EVENT_UNDER_VOLTAGE; + case REGULATOR_ERROR_OVER_CURRENT: + return REGULATOR_EVENT_OVER_CURRENT; + case REGULATOR_ERROR_REGULATION_OUT: + return REGULATOR_EVENT_REGULATION_OUT; + case REGULATOR_ERROR_FAIL: + return REGULATOR_EVENT_FAIL; + case REGULATOR_ERROR_OVER_TEMP: + return REGULATOR_EVENT_OVER_TEMP; + case REGULATOR_ERROR_UNDER_VOLTAGE_WARN: + return REGULATOR_EVENT_UNDER_VOLTAGE_WARN; + case REGULATOR_ERROR_OVER_CURRENT_WARN: + return REGULATOR_EVENT_OVER_CURRENT_WARN; + case REGULATOR_ERROR_OVER_VOLTAGE_WARN: + return REGULATOR_EVENT_OVER_VOLTAGE_WARN; + case REGULATOR_ERROR_OVER_TEMP_WARN: + return REGULATOR_EVENT_OVER_TEMP_WARN; + } + return 0; +} + + struct regulator_dev * regulator_register(const struct regulator_desc *regulator_desc, const struct regulator_config *config); -- cgit v1.2.3 From a764ff77d697a4a13e69b3379cc613f7409c6b9a Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Wed, 24 Nov 2021 09:17:13 +0200 Subject: regulator: irq_helper: Provide helper for trivial IRQ notifications Provide a generic map_event helper for regulators which have a notification IRQ with single, well defined purpose. Eg, IRQ always indicates exactly one event for exactly one regulator device. For such IRQs the mapping is trivial. 
Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/603b7ed1938013a00371c1e7ccc63dfb16982b87.1637736436.git.matti.vaittinen@fi.rohmeurope.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 6c6ec9658c30..4078c7776453 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -700,6 +700,8 @@ void *regulator_irq_helper(struct device *dev, int irq_flags, int common_errs, int *per_rdev_errs, struct regulator_dev **rdev, int rdev_amount); void regulator_irq_helper_cancel(void **handle); +int regulator_irq_map_event_simple(int irq, struct regulator_irq_data *rid, + unsigned long *dev_mask); void *rdev_get_drvdata(struct regulator_dev *rdev); struct device *rdev_get_dev(struct regulator_dev *rdev); -- cgit v1.2.3 From 432dd1fc134ef902b049b26839edfd3fdc1f8dc0 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Wed, 24 Nov 2021 07:57:49 +0200 Subject: regulator: rohm-generic: remove unused dummies Function rohm_regulator_set_voltage_sel_restricted() and rohm_regulator_set_dvs_levels() had inlined dummy implementations for cases when the real implementation was not configured in. All of the drivers who issue the call to these functions do SELECT the real implementation from the Kconfig. There should be no cases where the real implementation was not selected by the drivers using these functions - such a situation is likely to be an error which deserves to be noticed at compile-time. These dummies could in theory be used for compile-testing the drivers only (without the generic rohm regulator pieces). However, for such compile testing we should manually drop the selection from KConfig - and I guess that if it does not work out-of-the-box, then it is not going to happen. Especially when there should be no reason to omit compile-testing the generic rohm_regulator part. 
Crash test dummies. Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/YZ3UXXrk/Efe7Scj@fedora Signed-off-by: Mark Brown --- include/linux/mfd/rohm-generic.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h index 080d60adcd5f..5ed97a1d0908 100644 --- a/include/linux/mfd/rohm-generic.h +++ b/include/linux/mfd/rohm-generic.h @@ -82,20 +82,6 @@ int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dvs, int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev, unsigned int sel); -#else -static inline int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dvs, - struct device_node *np, - const struct regulator_desc *desc, - struct regmap *regmap) -{ - return 0; -} - -static inline int rohm_regulator_set_voltage_sel_restricted(struct regulator_dev *rdev, - unsigned int sel) -{ - return 0; -} #endif #endif -- cgit v1.2.3 From 2338e7bcef445059a99848a3eddde0b556277663 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 15 Nov 2021 15:10:01 +0300 Subject: device property: Remove device_add_properties() API There are no more users for it. Reviewed-by: Andy Shevchenko Signed-off-by: Heikki Krogerus Signed-off-by: Rafael J. 
Wysocki --- include/linux/property.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 88fa726a76df..16f736c698a2 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -378,10 +378,6 @@ property_entries_dup(const struct property_entry *properties); void property_entries_free(const struct property_entry *properties); -int device_add_properties(struct device *dev, - const struct property_entry *properties); -void device_remove_properties(struct device *dev); - bool device_dma_supported(struct device *dev); enum dev_dma_attr device_get_dma_attr(struct device *dev); -- cgit v1.2.3 From f124034faa911ed534bf8c4881ad98dbbde2a966 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Nov 2021 18:44:17 -0500 Subject: Revert "virtio_ring: validate used buffer length" This reverts commit 939779f5152d161b34f612af29e7dc1ac4472fcf. Attempts to validate length in the core did not work out: there turn out to exist multiple broken devices, and in particular legacy devices are known to be broken in this respect. We have ideas for handling this better in the next version but for now let's revert to a known good state to make sure drivers work for people. Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 44d0e09da2d9..41edbc01ffa4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -152,7 +152,6 @@ size_t virtio_max_dma_size(struct virtio_device *vdev); * @feature_table_size: number of entries in the feature table array. * @feature_table_legacy: same as feature_table but when working in legacy mode. * @feature_table_size_legacy: number of entries in feature table legacy array. - * @suppress_used_validation: set to not have core validate used length * @probe: the function to call when a device is found. 
Returns 0 or -errno. * @scan: optional function to call after successful probe; intended * for virtio-scsi to invoke a scan. @@ -169,7 +168,6 @@ struct virtio_driver { unsigned int feature_table_size; const unsigned int *feature_table_legacy; unsigned int feature_table_size_legacy; - bool suppress_used_validation; int (*validate)(struct virtio_device *dev); int (*probe)(struct virtio_device *dev); void (*scan)(struct virtio_device *dev); -- cgit v1.2.3 From 635e4172bd0a43af943fb164799965fc9a9a705d Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 2 Nov 2021 07:38:10 +0100 Subject: arm: remove zte zx platform left-over Commit 89d4f98ae90d ("ARM: remove zte zx platform") failed to remove some definitions for this platform's debug and serial, e.g., code dependent on the config DEBUG_ZTE_ZX. Fortunately, ./scripts/checkkconfigsymbols.py detects this and warns: DEBUG_ZTE_ZX Referencing files: arch/arm/include/debug/pl01x.S Further review by Arnd Bergmann identified even more dead code in the amba serial driver. Remove all these left-overs from the zte zx platform. 
Reviewed-by: Arnd Bergmann Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20211102063810.932-1-lukas.bulwahn@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/amba/bus.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index edfcf7a14dcd..6c7f47846971 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -90,14 +90,8 @@ enum amba_vendor { AMBA_VENDOR_ST = 0x80, AMBA_VENDOR_QCOM = 0x51, AMBA_VENDOR_LSI = 0xb6, - AMBA_VENDOR_LINUX = 0xfe, /* This value is not official */ }; -/* This is used to generate pseudo-ID for AMBA device */ -#define AMBA_LINUX_ID(conf, rev, part) \ - (((conf) & 0xff) << 24 | ((rev) & 0xf) << 20 | \ - AMBA_VENDOR_LINUX << 12 | ((part) & 0xfff)) - extern struct bus_type amba_bustype; #define to_amba_device(d) container_of(d, struct amba_device, dev) -- cgit v1.2.3 From 4167bd25ec3bc221387ec6811c05eadfe3cf1d3e Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 18 Nov 2021 08:31:24 +0100 Subject: mxser: move ids from pci_ids.h here There is no point having MOXA PCI device IDs in include/linux/pci_ids.h. Move them to the driver and sort them all by the ID. 
Cc: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Acked-by: Bjorn Helgaas Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211118073125.12283-19-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/pci_ids.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 011f2f1ea5bb..c389a9c0f290 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1964,24 +1964,6 @@ #define PCI_DEVICE_ID_APPLICOM_PCI2000PFB 0x0003 #define PCI_VENDOR_ID_MOXA 0x1393 -#define PCI_DEVICE_ID_MOXA_RC7000 0x0001 -#define PCI_DEVICE_ID_MOXA_CP102 0x1020 -#define PCI_DEVICE_ID_MOXA_CP102UL 0x1021 -#define PCI_DEVICE_ID_MOXA_CP102U 0x1022 -#define PCI_DEVICE_ID_MOXA_C104 0x1040 -#define PCI_DEVICE_ID_MOXA_CP104U 0x1041 -#define PCI_DEVICE_ID_MOXA_CP104JU 0x1042 -#define PCI_DEVICE_ID_MOXA_CP104EL 0x1043 -#define PCI_DEVICE_ID_MOXA_CT114 0x1140 -#define PCI_DEVICE_ID_MOXA_CP114 0x1141 -#define PCI_DEVICE_ID_MOXA_CP118U 0x1180 -#define PCI_DEVICE_ID_MOXA_CP118EL 0x1181 -#define PCI_DEVICE_ID_MOXA_CP132 0x1320 -#define PCI_DEVICE_ID_MOXA_CP132U 0x1321 -#define PCI_DEVICE_ID_MOXA_CP134U 0x1340 -#define PCI_DEVICE_ID_MOXA_C168 0x1680 -#define PCI_DEVICE_ID_MOXA_CP168U 0x1681 -#define PCI_DEVICE_ID_MOXA_CP168EL 0x1682 #define PCI_DEVICE_ID_MOXA_CP204J 0x2040 #define PCI_DEVICE_ID_MOXA_C218 0x2180 #define PCI_DEVICE_ID_MOXA_C320 0x3200 -- cgit v1.2.3 From 5db96ef23bda6c2a61a51693c85b78b52d03f654 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 22 Nov 2021 12:16:48 +0100 Subject: tty: drop tty_schedule_flip() Since commit a9c3f68f3cd8d (tty: Fix low_latency BUG) in 2014, tty_flip_buffer_push() is only a wrapper to tty_schedule_flip(). All users were converted in the previous patches, so remove tty_schedule_flip() completely while inlining its body into tty_flip_buffer_push(). One less exported function. 
Reviewed-by: Johan Hovold Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211122111648.30379-4-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_flip.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h index 9916acb5de49..483d41cbcbb7 100644 --- a/include/linux/tty_flip.h +++ b/include/linux/tty_flip.h @@ -17,7 +17,6 @@ int tty_insert_flip_string_fixed_flag(struct tty_port *port, int tty_prepare_flip_string(struct tty_port *port, unsigned char **chars, size_t size); void tty_flip_buffer_push(struct tty_port *port); -void tty_schedule_flip(struct tty_port *port); int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag); static inline int tty_insert_flip_char(struct tty_port *port, -- cgit v1.2.3 From d78328bcc4d0e677f2ff83f4ae1f43c933fbd143 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 22 Nov 2021 10:45:29 +0100 Subject: tty: remove file from tty_ldisc_ops::ioctl and compat_ioctl After the previous patches, no one needs the 'file' parameter in either ioctl hook from tty_ldisc_ops. So remove 'file' from both of them. Cc: Marcel Holtmann Cc: Johan Hedberg Cc: Luiz Augusto von Dentz Cc: Wolfgang Grandegger Cc: Marc Kleine-Budde Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Andreas Koensgen Cc: Paul Mackerras Acked-by: Krzysztof Kozlowski [NFC] Acked-by: Dmitry Torokhov Acked-by: Marc Kleine-Budde Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211122094529.24171-1-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index b85d84fb5f49..25f07017bbad 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -45,8 +45,7 @@ struct tty_struct; * some processing on the characters first.
If this function is * not defined, the user will receive an EIO error. * - * int (*ioctl)(struct tty_struct * tty, struct file * file, - * unsigned int cmd, unsigned long arg); + * int (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); * * This function is called when the user requests an ioctl which * is not handled by the tty layer or the low-level tty driver. @@ -56,8 +55,8 @@ struct tty_struct; * low-level driver can "grab" an ioctl request before the line * discpline has a chance to see it. * - * int (*compat_ioctl)(struct tty_struct * tty, struct file * file, - * unsigned int cmd, unsigned long arg); + * int (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, + * unsigned long arg); * * Process ioctl calls from 32-bit process on 64-bit system * @@ -192,10 +191,10 @@ struct tty_ldisc_ops { void **cookie, unsigned long offset); ssize_t (*write)(struct tty_struct *tty, struct file *file, const unsigned char *buf, size_t nr); - int (*ioctl)(struct tty_struct *tty, struct file *file, - unsigned int cmd, unsigned long arg); - int (*compat_ioctl)(struct tty_struct *tty, struct file *file, - unsigned int cmd, unsigned long arg); + int (*ioctl)(struct tty_struct *tty, unsigned int cmd, + unsigned long arg); + int (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, + unsigned long arg); void (*set_termios)(struct tty_struct *tty, struct ktermios *old); __poll_t (*poll)(struct tty_struct *, struct file *, struct poll_table_struct *); -- cgit v1.2.3 From 29c3002644bdd653f6ec6407d25135d0a4f7cefb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Nov 2021 12:24:46 -0800 Subject: net: optimize skb_postpull_rcsum() Remove one pair of add/adc instructions and their dependency against carry flag. 
We can leverage third argument to csum_partial(): X = csum_block_sub(X, csum_partial(start, len, 0), 0); --> X = csum_block_add(X, ~csum_partial(start, len, 0), 0); --> X = ~csum_partial(start, len, ~X); Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index eba256af64a5..eae4bd3237a4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3485,7 +3485,11 @@ __skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len, static inline void skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { - __skb_postpull_rcsum(skb, start, len, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = ~csum_partial(start, len, ~skb->csum); + else if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_start_offset(skb) < 0) + skb->ip_summed = CHECKSUM_NONE; } static __always_inline void -- cgit v1.2.3 From b4c80629c5c9d48880c5ad99943374f9ab72432e Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Sun, 23 May 2021 22:49:57 +0200 Subject: include/linux/byteorder/generic.h: fix index variables In cpu_to_be32_array() and be32_to_cpu_array() the length of the array is given by variable len of type size_t. An index variable of type int is used to iterate over the array. This is bound to fail for len > INT_MAX and lets GCC add instructions for sign extension. Correct the type of the index variable. 
Signed-off-by: Heinrich Schuchardt Link: https://lore.kernel.org/r/20210523204958.64575-1-xypron.glpk@gmx.de Signed-off-by: Greg Kroah-Hartman --- include/linux/byteorder/generic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index 4b13e0a3e15b..c9a4c96c9943 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -190,7 +190,7 @@ static inline void be64_add_cpu(__be64 *var, u64 val) static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len) { - int i; + size_t i; for (i = 0; i < len; i++) dst[i] = cpu_to_be32(src[i]); @@ -198,7 +198,7 @@ static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len) static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len) { - int i; + size_t i; for (i = 0; i < len; i++) dst[i] = be32_to_cpu(src[i]); -- cgit v1.2.3 From 18e6c0751cf9ae0631a2623e31af2bf504f72c30 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 26 Nov 2021 09:15:49 +0100 Subject: tty: finish kernel-doc of tty_struct members There are already pieces of kernel-doc documentation for struct tty_struct in tty.h. Finish the documentation for the members which were not yet documented. It also includes tuning the already existing pieces like flow and ctrl, especially adding highlights to them.
Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211126081611.11001-2-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 79 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 5dbd7c5afac7..da49ad9be281 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -122,33 +122,84 @@ struct tty_operations; /** * struct tty_struct - state associated with a tty while open * - * @flow.lock: lock for flow members - * @flow.stopped: tty stopped/started by tty_stop/tty_start - * @flow.tco_stopped: tty stopped/started by TCOOFF/TCOON ioctls (it has - * precedense over @flow.stopped) + * @magic: magic value set early in @alloc_tty_struct to %TTY_MAGIC, for + * debugging purposes + * @kref: reference counting by tty_kref_get() and tty_kref_put(), reaching zero + * frees the structure + * @dev: class device or %NULL (e.g. ptys, serdev) + * @driver: &struct tty_driver operating this tty + * @ops: &struct tty_operations of @driver for this tty (open, close, etc.) + * @index: index of this tty (e.g. to construct @name like tty12) + * @ldisc_sem: protects line discipline changes (@ldisc) -- lock tty not pty + * @ldisc: the current line discipline for this tty (n_tty by default) + * @atomic_write_lock: protects against concurrent writers, i.e. 
locks + * @write_cnt, @write_buf and similar + * @legacy_mutex: leftover from history (BKL -> BTM -> @legacy_mutex), + * protecting several operations on this tty + * @throttle_mutex: protects against concurrent tty_throttle_safe() and + * tty_unthrottle_safe() (but not tty_unthrottle()) + * @termios_rwsem: protects @termios and @termios_locked + * @winsize_mutex: protects @winsize + * @termios: termios for the current tty, copied from/to @driver.termios + * @termios_locked: locked termios (by %TIOCGLCKTRMIOS and %TIOCSLCKTRMIOS + * ioctls) + * @name: name of the tty constructed by tty_line_name() (e.g. ttyS3) + * @flags: bitwise OR of %TTY_THROTTLED, %TTY_IO_ERROR, ... + * @count: count of open processes, reaching zero cancels all the work for + * this tty and drops a @kref too (but does not free this tty) + * @winsize: size of the terminal "window" (cf. @winsize_mutex) + * @flow: flow settings grouped together, see also @flow.unused + * @flow.lock: lock for @flow members + * @flow.stopped: tty stopped/started by stop_tty()/start_tty() + * @flow.tco_stopped: tty stopped/started by %TCOOFF/%TCOON ioctls (it has + * precedence over @flow.stopped) * @flow.unused: alignment for Alpha, so that no members other than @flow.* are * modified by the same 64b word store. The @flow's __aligned is * there for the very same reason. - * @ctrl.lock: lock for ctrl members + * @ctrl: control settings grouped together, see also @ctrl.unused + * @ctrl.lock: lock for @ctrl members * @ctrl.pgrp: process group of this tty (setpgrp(2)) * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both - * @ctrl.lock and legacy mutex, readers must use at least one of + * @ctrl.lock and @legacy_mutex, readers must use at least one of * them. 
- * @ctrl.pktstatus: packet mode status (bitwise OR of TIOCPKT_* constants) + * @ctrl.pktstatus: packet mode status (bitwise OR of %TIOCPKT_ constants) * @ctrl.packet: packet mode enabled + * @ctrl.unused: alignment for Alpha, see @flow.unused for explanation + * @hw_stopped: not controlled by the tty layer, under @driver's control for CTS + * handling + * @receive_room: bytes permitted to feed to @ldisc without any being lost + * @flow_change: controls behavior of throttling, see tty_throttle_safe() and + * tty_unthrottle_safe() + * @link: link to another pty (master -> slave and vice versa) + * @fasync: state for %O_ASYNC (for %SIGIO); managed by fasync_helper() + * @write_wait: concurrent writers are waiting in this queue until they are + * allowed to write + * @read_wait: readers wait for data in this queue + * @hangup_work: normally a work to perform a hangup (do_tty_hangup()); while + * freeing the tty, (re)used to release_one_tty() + * @disc_data: pointer to @ldisc's private data (e.g. to &struct n_tty_data) + * @driver_data: pointer to @driver's private data (e.g. &struct uart_state) + * @files_lock: protects @tty_files list + * @tty_files: list of (re)openers of this tty (i.e. linked &struct + * tty_file_private) + * @closing: when set during close, n_tty processes only START & STOP chars + * @write_buf: temporary buffer used during tty_write() to copy user data to + * @write_cnt: count of bytes written in tty_write() to @write_buf + * @SAK_work: if the tty has a pending do_SAK, it is queued here + * @port: persistent storage for this device (i.e. &struct tty_port) * * All of the state associated with a tty while the tty is open. Persistent - * storage for tty devices is referenced here as @port in struct tty_port. + * storage for tty devices is referenced here as @port and is documented in + * &struct tty_port. */ struct tty_struct { int magic; struct kref kref; - struct device *dev; /* class device or NULL (e.g. 
ptys, serdev) */ + struct device *dev; struct tty_driver *driver; const struct tty_operations *ops; int index; - /* Protects ldisc changes: Lock tty not pty */ struct ld_semaphore ldisc_sem; struct tty_ldisc *ldisc; @@ -157,12 +208,11 @@ struct tty_struct { struct mutex throttle_mutex; struct rw_semaphore termios_rwsem; struct mutex winsize_mutex; - /* Termios values are protected by the termios rwsem */ struct ktermios termios, termios_locked; char name[64]; unsigned long flags; int count; - struct winsize winsize; /* winsize_mutex */ + struct winsize winsize; struct { spinlock_t lock; @@ -181,7 +231,7 @@ struct tty_struct { } __aligned(sizeof(unsigned long)) ctrl; int hw_stopped; - unsigned int receive_room; /* Bytes free for queue */ + unsigned int receive_room; int flow_change; struct tty_struct *link; @@ -191,7 +241,7 @@ struct tty_struct { struct work_struct hangup_work; void *disc_data; void *driver_data; - spinlock_t files_lock; /* protects tty_files list */ + spinlock_t files_lock; struct list_head tty_files; #define N_TTY_BUF_SIZE 4096 @@ -199,7 +249,6 @@ struct tty_struct { int closing; unsigned char *write_buf; int write_cnt; - /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; struct tty_port *port; } __randomize_layout; -- cgit v1.2.3 From 61c83addb77c65f498a8db0e7113dc3acf753c45 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 26 Nov 2021 09:15:50 +0100 Subject: tty: add kernel-doc for tty_port tty_port used to have only short comments along its members. Convert them into proper kernel-doc comments in front of the structure. And add some more explanation to them where needed. The whole structure purpose and handling is documented at the end too -- some pieces of preexisting text moved to this place. 
Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211126081611.11001-3-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_port.h | 104 +++++++++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index 6e86e9e118b6..9091e1c8de4c 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -7,17 +7,6 @@ #include #include -/* - * Port level information. Each device keeps its own port level information - * so provide a common structure for those ports wanting to use common support - * routines. - * - * The tty port has a different lifetime to the tty so must be kept apart. - * In addition be careful as tty -> port mappings are valid for the life - * of the tty object but in many cases port -> tty mappings are valid only - * until a hangup so don't use the wrong path. - */ - struct attribute_group; struct tty_driver; struct tty_port; @@ -48,30 +37,77 @@ struct tty_port_client_operations { extern const struct tty_port_client_operations tty_port_default_client_ops; +/** + * struct tty_port -- port level information + * + * @buf: buffer for this port, locked internally + * @tty: back pointer to &struct tty_struct, valid only if the tty is open. Use + * tty_port_tty_get() to obtain it (and tty_kref_put() to release). + * @itty: internal back pointer to &struct tty_struct. Avoid this. It should be + * eliminated in the long term. + * @ops: tty port operations (like activate, shutdown), see &struct + * tty_port_operations + * @client_ops: tty port client operations (like receive_buf, write_wakeup). + * By default, tty_port_default_client_ops is used. + * @lock: lock protecting @tty + * @blocked_open: # of procs waiting for open in tty_port_block_til_ready() + * @count: usage count + * @open_wait: open waiters queue (waiting e.g. 
for a carrier) + * @delta_msr_wait: modem status change queue (waiting for MSR changes) + * @flags: user TTY flags (%ASYNC_) + * @iflags: internal flags (%TTY_PORT_) + * @console: when set, the port is a console + * @mutex: locking, for open, shutdown and other port operations + * @buf_mutex: @xmit_buf alloc lock + * @xmit_buf: optional xmit buffer used by some drivers + * @close_delay: delay in jiffies to wait when closing the port + * @closing_wait: delay in jiffies for output to be sent before closing + * @drain_delay: set to zero if no pure time based drain is needed else set to + * size of fifo + * @kref: references counter. Reaching zero calls @ops->destruct() if non-%NULL + * or frees the port otherwise. + * @client_data: pointer to private data, for @client_ops + * + * Each device keeps its own port level information. &struct tty_port was + * introduced as a common structure for such information. As every TTY device + * shall have a backing tty_port structure, every driver can use these members. + * + * The tty port has a different lifetime to the tty so must be kept apart. + * In addition be careful as tty -> port mappings are valid for the life + * of the tty object but in many cases port -> tty mappings are valid only + * until a hangup so don't use the wrong path. + * + * Tty port shall be initialized by tty_port_init() and shut down either by + * tty_port_destroy() (refcounting not used), or tty_port_put() (refcounting). + * + * There is a lot of helpers around &struct tty_port too. To name the most + * significant ones: tty_port_open(), tty_port_close() (or + * tty_port_close_start() and tty_port_close_end() separately if need be), and + * tty_port_hangup(). These call @ops->activate() and @ops->shutdown() as + * needed. 
+ */ struct tty_port { - struct tty_bufhead buf; /* Locked internally */ - struct tty_struct *tty; /* Back pointer */ - struct tty_struct *itty; /* internal back ptr */ - const struct tty_port_operations *ops; /* Port operations */ - const struct tty_port_client_operations *client_ops; /* Port client operations */ - spinlock_t lock; /* Lock protecting tty field */ - int blocked_open; /* Waiting to open */ - int count; /* Usage count */ - wait_queue_head_t open_wait; /* Open waiters */ - wait_queue_head_t delta_msr_wait; /* Modem status change */ - unsigned long flags; /* User TTY flags ASYNC_ */ - unsigned long iflags; /* Internal flags TTY_PORT_ */ - unsigned char console:1; /* port is a console */ - struct mutex mutex; /* Locking */ - struct mutex buf_mutex; /* Buffer alloc lock */ - unsigned char *xmit_buf; /* Optional buffer */ - unsigned int close_delay; /* Close port delay */ - unsigned int closing_wait; /* Delay for output */ - int drain_delay; /* Set to zero if no pure time - based drain is needed else - set to size of fifo */ - struct kref kref; /* Ref counter */ - void *client_data; + struct tty_bufhead buf; + struct tty_struct *tty; + struct tty_struct *itty; + const struct tty_port_operations *ops; + const struct tty_port_client_operations *client_ops; + spinlock_t lock; + int blocked_open; + int count; + wait_queue_head_t open_wait; + wait_queue_head_t delta_msr_wait; + unsigned long flags; + unsigned long iflags; + unsigned char console:1; + struct mutex mutex; + struct mutex buf_mutex; + unsigned char *xmit_buf; + unsigned int close_delay; + unsigned int closing_wait; + int drain_delay; + struct kref kref; + void *client_data; }; /* tty_port::iflags bits -- use atomic bit ops */ -- cgit v1.2.3 From a6563830215226aae0e7e6802955c77a6a7b7547 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 26 Nov 2021 09:15:51 +0100 Subject: tty: add kernel-doc for tty_driver tty_driver used to have only short comments along its members. 
Convert them into proper kernel-doc comments in front of the structure. And add some more explanation to them where needed. The whole structure handling is documented at the end too. Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211126081611.11001-4-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 62 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 795b94ccdeb6..3622404a678d 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -291,23 +291,61 @@ struct tty_operations { int (*proc_show)(struct seq_file *, void *); } __randomize_layout; +/** + * struct tty_driver -- driver for TTY devices + * + * @magic: set to %TTY_DRIVER_MAGIC in __tty_alloc_driver() + * @kref: reference counting. Reaching zero frees all the internals and the + * driver. + * @cdevs: allocated/registered character /dev devices + * @owner: modules owning this driver. Used drivers cannot be rmmod'ed. + * Automatically set by tty_alloc_driver(). + * @driver_name: name of the driver used in /proc/tty + * @name: used for constructing /dev node name + * @name_base: used as a number base for constructing /dev node name + * @major: major /dev device number (zero for autoassignment) + * @minor_start: the first minor /dev device number + * @num: number of devices allocated + * @type: type of tty driver (%TTY_DRIVER_TYPE_) + * @subtype: subtype of tty driver (%SYSTEM_TYPE_, %PTY_TYPE_, %SERIAL_TYPE_) + * @init_termios: termios to set to each tty initially (e.g. 
%tty_std_termios) + * @flags: tty driver flags (%TTY_DRIVER_) + * @proc_entry: proc fs entry, used internally + * @other: driver of the linked tty; only used for the PTY driver + * @ttys: array of active &struct tty_struct, set by tty_standard_install() + * @ports: array of &struct tty_port; can be set during initialization by + * tty_port_link_device() and similar + * @termios: storage for termios at each TTY close for the next open + * @driver_state: pointer to driver's arbitrary data + * @ops: driver hooks for TTYs. Set them using tty_set_operations(). Use &struct + * tty_port helpers in them as much as possible. + * @tty_drivers: used internally to link tty_drivers together + * + * The usual handling of &struct tty_driver is to allocate it by + * tty_alloc_driver(), set up all the necessary members, and register it by + * tty_register_driver(). At last, the driver is torn down by calling + * tty_unregister_driver() followed by tty_driver_kref_put(). + * + * The fields required to be set before calling tty_register_driver() include + * @driver_name, @name, @type, @subtype, @init_termios, and @ops. 
+ */ struct tty_driver { - int magic; /* magic number for this structure */ - struct kref kref; /* Reference management */ + int magic; + struct kref kref; struct cdev **cdevs; struct module *owner; const char *driver_name; const char *name; - int name_base; /* offset of printed name */ - int major; /* major device number */ - int minor_start; /* start of minor device number */ - unsigned int num; /* number of devices allocated */ - short type; /* type of tty driver */ - short subtype; /* subtype of tty driver */ - struct ktermios init_termios; /* Initial termios */ - unsigned long flags; /* tty driver flags */ - struct proc_dir_entry *proc_entry; /* /proc fs entry */ - struct tty_driver *other; /* only used for the PTY driver */ + int name_base; + int major; + int minor_start; + unsigned int num; + short type; + short subtype; + struct ktermios init_termios; + unsigned long flags; + struct proc_dir_entry *proc_entry; + struct tty_driver *other; /* * Pointer to the tty data structures -- cgit v1.2.3 From 1fe183091753b1d7f11e70593700c0c0ef268db7 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 26 Nov 2021 09:15:52 +0100 Subject: tty: add kernel-doc for tty_operations The tty_operations structure was already documented in a standalone comment at the beginning of the header. Move it right before the structure and reformat it so it complies with kernel-doc. That way, we can include it in Documentation/ later in this series. Note that we named proc_show's parameters, so that we can reference them in the text.
Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211126081611.11001-5-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 398 +++++++++++++++++++++++++++------------------ 1 file changed, 241 insertions(+), 157 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 3622404a678d..5611992ab26a 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -2,248 +2,332 @@ #ifndef _LINUX_TTY_DRIVER_H #define _LINUX_TTY_DRIVER_H -/* - * This structure defines the interface between the low-level tty - * driver and the tty routines. The following routines can be - * defined; unless noted otherwise, they are optional, and can be - * filled in with a null pointer. +#include +#include +#include +#include +#include +#include +#include + +struct tty_struct; +struct tty_driver; +struct serial_icounter_struct; +struct serial_struct; + +/** + * struct tty_operations -- interface between driver and tty * - * struct tty_struct * (*lookup)(struct tty_driver *self, struct file *, int idx) + * @lookup: ``struct tty_struct *()(struct tty_driver *self, struct file *, + * int idx)`` * - * Return the tty device corresponding to idx, NULL if there is not - * one currently in use and an ERR_PTR value on error. Called under - * tty_mutex (for now!) + * Return the tty device corresponding to @idx, %NULL if there is not + * one currently in use and an %ERR_PTR value on error. Called under + * %tty_mutex (for now!) * - * Optional method. Default behaviour is to use the ttys array + * Optional method. Default behaviour is to use the @self->ttys array. * - * int (*install)(struct tty_driver *self, struct tty_struct *tty) + * @install: ``int ()(struct tty_driver *self, struct tty_struct *tty)`` * - * Install a new tty into the tty driver internal tables. Used in - * conjunction with lookup and remove methods. + * Install a new @tty into the @self's internal tables. 
Used in + * conjunction with @lookup and @remove methods. * - * Optional method. Default behaviour is to use the ttys array + * Optional method. Default behaviour is to use the @self->ttys array. * - * void (*remove)(struct tty_driver *self, struct tty_struct *tty) + * @remove: ``void ()(struct tty_driver *self, struct tty_struct *tty)`` * - * Remove a closed tty from the tty driver internal tables. Used in - * conjunction with lookup and remove methods. + * Remove a closed @tty from the @self's internal tables. Used in + * conjunction with @lookup and @remove methods. * - * Optional method. Default behaviour is to use the ttys array + * Optional method. Default behaviour is to use the @self->ttys array. * - * int (*open)(struct tty_struct * tty, struct file * filp); + * @open: ``int ()(struct tty_struct *tty, struct file *)`` * - * This routine is called when a particular tty device is opened. - * This routine is mandatory; if this routine is not filled in, - * the attempted open will fail with ENODEV. + * This routine is called when a particular @tty device is opened. This + * routine is mandatory; if this routine is not filled in, the attempted + * open will fail with %ENODEV. * * Required method. Called with tty lock held. * - * void (*close)(struct tty_struct * tty, struct file * filp); + * @close: ``void ()(struct tty_struct *tty, struct file *)`` * - * This routine is called when a particular tty device is closed. - * Note: called even if the corresponding open() failed. + * This routine is called when a particular @tty device is closed. + * + * Remark: called even if the corresponding @open() failed. * * Required method. Called with tty lock held. * - * void (*shutdown)(struct tty_struct * tty); + * @shutdown: ``void ()(struct tty_struct *tty)`` * - * This routine is called under the tty lock when a particular tty device - * is closed for the last time. 
It executes before the tty resources - * are freed so may execute while another function holds a tty kref. + * This routine is called under the tty lock when a particular @tty device + * is closed for the last time. It executes before the @tty resources + * are freed so may execute while another function holds a @tty kref. * - * void (*cleanup)(struct tty_struct * tty); + * @cleanup: ``void ()(struct tty_struct *tty)`` * - * This routine is called asynchronously when a particular tty device + * This routine is called asynchronously when a particular @tty device * is closed for the last time freeing up the resources. This is * actually the second part of shutdown for routines that might sleep. * + * @write: ``int ()(struct tty_struct *tty, const unsigned char *buf, + * int count)`` * - * int (*write)(struct tty_struct * tty, - * const unsigned char *buf, int count); - * - * This routine is called by the kernel to write a series of - * characters to the tty device. The characters may come from - * user space or kernel space. This routine will return the + * This routine is called by the kernel to write a series (@count) of + * characters (@buf) to the @tty device. The characters may come from + * user space or kernel space. This routine will return the * number of characters actually accepted for writing. * * Optional: Required for writable devices. * - * int (*put_char)(struct tty_struct *tty, unsigned char ch); + * @put_char: ``int ()(struct tty_struct *tty, unsigned char ch)`` * - * This routine is called by the kernel to write a single - * character to the tty device. If the kernel uses this routine, - * it must call the flush_chars() routine (if defined) when it is - * done stuffing characters into the driver. If there is no room - * in the queue, the character is ignored. + * This routine is called by the kernel to write a single character @ch to + * the @tty device. 
If the kernel uses this routine, it must call the + * @flush_chars() routine (if defined) when it is done stuffing characters + * into the driver. If there is no room in the queue, the character is + * ignored. * - * Optional: Kernel will use the write method if not provided. + * Optional: Kernel will use the @write method if not provided. Do not + * call this function directly, call tty_put_char(). * - * Note: Do not call this function directly, call tty_put_char + * @flush_chars: ``void ()(struct tty_struct *tty)`` * - * void (*flush_chars)(struct tty_struct *tty); + * This routine is called by the kernel after it has written a + * series of characters to the tty device using @put_char(). * - * This routine is called by the kernel after it has written a - * series of characters to the tty device using put_char(). + * Optional. Do not call this function directly, call + * tty_driver_flush_chars(). * - * Optional: + * @write_room: ``unsigned int ()(struct tty_struct *tty)`` * - * Note: Do not call this function directly, call tty_driver_flush_chars - * - * unsigned int (*write_room)(struct tty_struct *tty); - * - * This routine returns the numbers of characters the tty driver - * will accept for queuing to be written. This number is subject - * to change as output buffers get emptied, or if the output flow + * This routine returns the numbers of characters the @tty driver + * will accept for queuing to be written. This number is subject + * to change as output buffers get emptied, or if the output flow * control is acted. * - * Required if write method is provided else not needed. + * Required if @write method is provided else not needed. 
Do not call this + * function directly, call tty_write_room() * - * Note: Do not call this function directly, call tty_write_room - * - * int (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); + * @chars_in_buffer: ``unsigned int ()(struct tty_struct *tty)`` * - * This routine allows the tty driver to implement - * device-specific ioctls. If the ioctl number passed in cmd - * is not recognized by the driver, it should return ENOIOCTLCMD. + * This routine returns the number of characters in the device private + * output queue. Used in tty_wait_until_sent() and for poll() + * implementation. * - * Optional + * Optional: if not provided, it is assumed there is no queue on the + * device. Do not call this function directly, call tty_chars_in_buffer(). * - * long (*compat_ioctl)(struct tty_struct *tty,, - * unsigned int cmd, unsigned long arg); + * @ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd, + * unsigned long arg)`` * - * implement ioctl processing for 32 bit process on 64 bit system + * This routine allows the @tty driver to implement device-specific + * ioctls. If the ioctl number passed in @cmd is not recognized by the + * driver, it should return %ENOIOCTLCMD. * - * Optional - * - * void (*set_termios)(struct tty_struct *tty, struct ktermios * old); + * Optional. * - * This routine allows the tty driver to be notified when - * device's termios settings have changed. + * @compat_ioctl: ``long ()(struct tty_struct *tty, unsigned int cmd, + * unsigned long arg)`` * - * Optional: Called under the termios lock + * Implement ioctl processing for 32 bit process on 64 bit system. * + * Optional. * - * void (*set_ldisc)(struct tty_struct *tty); + * @set_termios: ``void ()(struct tty_struct *tty, struct ktermios *old)`` * - * This routine allows the tty driver to be notified when the - * device's termios settings have changed. + * This routine allows the @tty driver to be notified when device's + * termios settings have changed. 
* - * Optional: Called under BKL (currently) - * - * void (*throttle)(struct tty_struct * tty); + * Optional: Called under the @tty->termios_rwsem. * - * This routine notifies the tty driver that input buffers for - * the line discipline are close to full, and it should somehow - * signal that no more characters should be sent to the tty. + * @set_ldisc: ``void ()(struct tty_struct *tty)`` * - * Optional: Always invoke via tty_throttle_safe(), called under the - * termios lock. - * - * void (*unthrottle)(struct tty_struct * tty); + * This routine allows the @tty driver to be notified when the device's + * line discipline is being changed. * - * This routine notifies the tty drivers that it should signals - * that characters can now be sent to the tty without fear of - * overrunning the input buffers of the line disciplines. - * - * Optional: Always invoke via tty_unthrottle(), called under the - * termios lock. + * Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem. * - * void (*stop)(struct tty_struct *tty); + * @throttle: ``void ()(struct tty_struct *tty)`` * - * This routine notifies the tty driver that it should stop - * outputting characters to the tty device. + * This routine notifies the @tty driver that input buffers for the line + * discipline are close to full, and it should somehow signal that no more + * characters should be sent to the @tty. * - * Called with ->flow.lock held. Serialized with start() method. + * Optional: Always invoke via tty_throttle_safe(). Called under the + * @tty->termios_rwsem. * - * Optional: + * @unthrottle: ``void ()(struct tty_struct *tty)`` * - * Note: Call stop_tty not this method. - * - * void (*start)(struct tty_struct *tty); + * This routine notifies the @tty driver that it should signal that + * characters can now be sent to the @tty without fear of overrunning the + * input buffers of the line disciplines. * - * This routine notifies the tty driver that it resume sending + * Optional. 
Always invoke via tty_unthrottle(). Called under the + * @tty->termios_rwsem. + * + * @stop: ``void ()(struct tty_struct *tty)`` + * + * This routine notifies the @tty driver that it should stop outputting * characters to the tty device. * - * Called with ->flow.lock held. Serialized with stop() method. + * Called with @tty->flow.lock held. Serialized with @start() method. * - * Optional: + * Optional. Always invoke via stop_tty(). * - * Note: Call start_tty not this method. - * - * void (*hangup)(struct tty_struct *tty); + * @start: ``void ()(struct tty_struct *tty)`` * - * This routine notifies the tty driver that it should hang up the - * tty device. + * This routine notifies the @tty driver that it resumed sending + * characters to the @tty device. * - * Optional: + * Called with @tty->flow.lock held. Serialized with stop() method. * - * Called with tty lock held. + * Optional. Always invoke via start_tty(). * - * int (*break_ctl)(struct tty_struct *tty, int state); + * @hangup: ``void ()(struct tty_struct *tty)`` * - * This optional routine requests the tty driver to turn on or - * off BREAK status on the RS-232 port. If state is -1, - * then the BREAK status should be turned on; if state is 0, then - * BREAK should be turned off. + * This routine notifies the @tty driver that it should hang up the @tty + * device. * - * If this routine is implemented, the high-level tty driver will - * handle the following ioctls: TCSBRK, TCSBRKP, TIOCSBRK, - * TIOCCBRK. + * Optional. Called with tty lock held. * - * If the driver sets TTY_DRIVER_HARDWARE_BREAK then the interface - * will also be called with actual times and the hardware is expected - * to do the delay work itself. 0 and -1 are still used for on/off. + * @break_ctl: ``int ()(struct tty_struct *tty, int state)`` * - * Optional: Required for TCSBRK/BRKP/etc handling. + * This optional routine requests the @tty driver to turn on or off BREAK + * status on the RS-232 port. 
If @state is -1, then the BREAK status + * should be turned on; if @state is 0, then BREAK should be turned off. * - * void (*wait_until_sent)(struct tty_struct *tty, int timeout); - * - * This routine waits until the device has written out all of the - * characters in its transmitter FIFO. + * If this routine is implemented, the high-level tty driver will handle + * the following ioctls: %TCSBRK, %TCSBRKP, %TIOCSBRK, %TIOCCBRK. + * + * If the driver sets %TTY_DRIVER_HARDWARE_BREAK in tty_alloc_driver(), + * then the interface will also be called with actual times and the + * hardware is expected to do the delay work itself. 0 and -1 are still + * used for on/off. + * + * Optional: Required for %TCSBRK/%BRKP/etc. handling. + * + * @flush_buffer: ``void ()(struct tty_struct *tty)`` + * + * This routine discards device private output buffer. Invoked on close, + * hangup, to implement %TCOFLUSH ioctl and similar. + * + * Optional: if not provided, it is assumed there is no queue on the + * device. Do not call this function directly, call + * tty_driver_flush_buffer(). + * + * @wait_until_sent: ``void ()(struct tty_struct *tty, int timeout)`` + * + * This routine waits until the device has written out all of the + * characters in its transmitter FIFO. Or until @timeout (in jiffies) is + * reached. + * + * Optional: If not provided, the device is assumed to have no FIFO. + * Usually correct to invoke via tty_wait_until_sent(). + * + * @send_xchar: ``void ()(struct tty_struct *tty, char ch)`` + * + * This routine is used to send a high-priority XON/XOFF character (@ch) + * to the @tty device. * - * Optional: If not provided the device is assumed to have no FIFO + * Optional: If not provided, then the @write method is called under + * the @tty->atomic_write_lock to keep it serialized with the ldisc. 
* - * Note: Usually correct to call tty_wait_until_sent + * @tiocmget: ``int ()(struct tty_struct *tty)`` * - * void (*send_xchar)(struct tty_struct *tty, char ch); + * This routine is used to obtain the modem status bits from the @tty + * driver. * - * This routine is used to send a high-priority XON/XOFF - * character to the device. + * Optional: If not provided, then %ENOTTY is returned from the %TIOCMGET + * ioctl. Do not call this function directly, call tty_tiocmget(). * - * Optional: If not provided then the write method is called under - * the atomic write lock to keep it serialized with the ldisc. + * @tiocmset: ``int ()(struct tty_struct *tty, + * unsigned int set, unsigned int clear)`` * - * int (*resize)(struct tty_struct *tty, struct winsize *ws) + * This routine is used to set the modem status bits to the @tty driver. + * First, @clear bits should be cleared, then @set bits set. * - * Called when a termios request is issued which changes the - * requested terminal geometry. + * Optional: If not provided, then %ENOTTY is returned from the %TIOCMSET + * ioctl. Do not call this function directly, call tty_tiocmset(). + * + * @resize: ``int ()(struct tty_struct *tty, struct winsize *ws)`` + * + * Called when a termios request is issued which changes the requested + * terminal geometry to @ws. * * Optional: the default action is to update the termios structure * without error. This is usually the correct behaviour. Drivers should - * not force errors here if they are not resizable objects (eg a serial + * not force errors here if they are not resizable objects (e.g. a serial * line). See tty_do_resize() if you need to wrap the standard method - * in your own logic - the usual case. + * in your own logic -- the usual case. + * + * @get_icount: ``int ()(struct tty_struct *tty, + * struct serial_icounter *icount)`` + * + * Called when the @tty device receives a %TIOCGICOUNT ioctl. Passed a + * kernel structure @icount to complete. 
+ * + * Optional: called only if provided, otherwise %ENOTTY will be returned. + * + * @get_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)`` + * + * Called when the @tty device receives a %TIOCGSERIAL ioctl. Passed a + * kernel structure @p (&struct serial_struct) to complete. + * + * Optional: called only if provided, otherwise %ENOTTY will be returned. + * Do not call this function directly, call tty_tiocgserial(). + * + * @set_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)`` + * + * Called when the @tty device receives a %TIOCSSERIAL ioctl. Passed a + * kernel structure @p (&struct serial_struct) to set the values from. + * + * Optional: called only if provided, otherwise %ENOTTY will be returned. + * Do not call this function directly, call tty_tiocsserial(). + * + * @show_fdinfo: ``void ()(struct tty_struct *tty, struct seq_file *m)`` + * + * Called when the @tty device file descriptor receives a fdinfo request + * from VFS (to show in /proc/<pid>/fdinfo/). @m should be filled with + * information. + * + * Optional: called only if provided, otherwise nothing is written to @m. + * Do not call this function directly, call tty_show_fdinfo(). + * + * @poll_init: ``int ()(struct tty_driver *driver, int line, char *options)`` + * + * kgdboc support (Documentation/dev-tools/kgdb.rst). This routine is + * called to initialize the HW for later use by calling @poll_get_char or + * @poll_put_char. + * + * Optional: called only if provided, otherwise skipped as a non-polling + * driver. + * + * @poll_get_char: ``int ()(struct tty_driver *driver, int line)`` + * + * kgdboc support (see @poll_init). @driver should read a character from a + * tty identified by @line and return it. + * + * Optional: called only if @poll_init provided.
* - * int (*get_icount)(struct tty_struct *tty, struct serial_icounter *icount); + * @poll_put_char: ``void ()(struct tty_driver *driver, int line, char ch)`` * - * Called when the device receives a TIOCGICOUNT ioctl. Passed a kernel - * structure to complete. This method is optional and will only be called - * if provided (otherwise ENOTTY will be returned). + * kgdboc support (see @poll_init). @driver should write character @ch to + * a tty identified by @line. + * + * Optional: called only if @poll_init provided. + * + * @proc_show: ``int ()(struct seq_file *m, void *driver)`` + * + * Driver @driver (cast to &struct tty_driver) can show additional info in + * /proc/tty/driver/<driver_name>. It is enough to fill in the information + * into @m. + * + * Optional: called only if provided, otherwise no /proc entry created. + * + * This structure defines the interface between the low-level tty driver and + * the tty routines. These routines can be defined. Unless noted otherwise, + * they are optional, and can be filled in with a %NULL pointer. */ - -#include -#include -#include -#include -#include -#include -#include - -struct tty_struct; -struct tty_driver; -struct serial_icounter_struct; -struct serial_struct; - struct tty_operations { struct tty_struct * (*lookup)(struct tty_driver *driver, struct file *filp, int idx); @@ -288,7 +372,7 @@ struct tty_operations { int (*poll_get_char)(struct tty_driver *driver, int line); void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif - int (*proc_show)(struct seq_file *, void *); + int (*proc_show)(struct seq_file *m, void *driver); } __randomize_layout; /** -- cgit v1.2.3 From 630bf86d15778fd3e5df17cb6e00839d0f44a707 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 26 Nov 2021 09:15:53 +0100 Subject: tty: add kernel-doc for tty_port_operations tty_port_operations used to have only comments along its members. Convert them into proper kernel-doc comments in front of the structure.
And add some more explanation to them where needed. Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211126081611.11001-6-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_port.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index 9091e1c8de4c..d3ea9ed0b98e 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -12,21 +12,28 @@ struct tty_driver; struct tty_port; struct tty_struct; +/** + * struct tty_port_operations -- operations on tty_port + * @carrier_raised: return 1 if the carrier is raised on @port + * @dtr_rts: raise the DTR line if @raise is nonzero, otherwise lower DTR + * @shutdown: called when the last close completes or a hangup finishes IFF the + * port was initialized. Do not use to free resources. Turn off the device + * only. Called under the port mutex to serialize against @activate and + * @shutdown. + * @activate: called under the port mutex from tty_port_open(), serialized using + * the port mutex. Supposed to turn on the device. + * + * FIXME: long term getting the tty argument *out* of this would be good + * for consoles. + * + * @destruct: called on the final put of a port. Free resources, possibly incl. + * the port itself. + */ struct tty_port_operations { - /* Return 1 if the carrier is raised */ int (*carrier_raised)(struct tty_port *port); - /* Control the DTR line */ void (*dtr_rts)(struct tty_port *port, int raise); - /* Called when the last close completes or a hangup finishes - IFF the port was initialized. Do not use to free resources. 
Called - under the port mutex to serialize against activate/shutdowns */ void (*shutdown)(struct tty_port *port); - /* Called under the port mutex from tty_port_open, serialized using - the port mutex */ - /* FIXME: long term getting the tty argument *out* of this would be - good for consoles */ int (*activate)(struct tty_port *port, struct tty_struct *tty); - /* Called on the final put of a port */ void (*destruct)(struct tty_port *port); }; -- cgit v1.2.3 From 0c6119f9f7dc03a53bd35ca5a77926eef3c33d10 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 26 Nov 2021 09:15:54 +0100 Subject: tty: add kernel-doc for tty_ldisc_ops tty_ldisc_ops structure was already partially documented in a standalone comment in the header beginning. Move it right before the structure and reformat it so it complies to kernel-doc. That way, we can include it in Documentation/ later in this series. And add the documentation for the members where missing too. Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211126081611.11001-7-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 259 +++++++++++++++++++++++----------------------- 1 file changed, 130 insertions(+), 129 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index 25f07017bbad..e0da0ba02de9 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -4,126 +4,6 @@ struct tty_struct; -/* - * This structure defines the interface between the tty line discipline - * implementation and the tty routines. The following routines can be - * defined; unless noted otherwise, they are optional, and can be - * filled in with a null pointer. - * - * int (*open)(struct tty_struct *); - * - * This function is called when the line discipline is associated - * with the tty. The line discipline can use this as an - * opportunity to initialize any state needed by the ldisc routines. 
- * - * void (*close)(struct tty_struct *); - * - * This function is called when the line discipline is being - * shutdown, either because the tty is being closed or because - * the tty is being changed to use a new line discipline - * - * void (*flush_buffer)(struct tty_struct *tty); - * - * This function instructs the line discipline to clear its - * buffers of any input characters it may have queued to be - * delivered to the user mode process. - * - * ssize_t (*read)(struct tty_struct * tty, struct file * file, - * unsigned char * buf, size_t nr); - * - * This function is called when the user requests to read from - * the tty. The line discipline will return whatever characters - * it has buffered up for the user. If this function is not - * defined, the user will receive an EIO error. - * - * ssize_t (*write)(struct tty_struct * tty, struct file * file, - * const unsigned char * buf, size_t nr); - * - * This function is called when the user requests to write to the - * tty. The line discipline will deliver the characters to the - * low-level tty device for transmission, optionally performing - * some processing on the characters first. If this function is - * not defined, the user will receive an EIO error. - * - * int (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); - * - * This function is called when the user requests an ioctl which - * is not handled by the tty layer or the low-level tty driver. - * It is intended for ioctls which affect line discpline - * operation. Note that the search order for ioctls is (1) tty - * layer, (2) tty low-level driver, (3) line discpline. So a - * low-level driver can "grab" an ioctl request before the line - * discpline has a chance to see it. - * - * int (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, - * unsigned long arg); - * - * Process ioctl calls from 32-bit process on 64-bit system - * - * NOTE: only ioctls that are neither "pointer to compatible - * structure" nor tty-generic. 
Something private that takes - * an integer or a pointer to wordsize-sensitive structure - * belongs here, but most of ldiscs will happily leave - * it NULL. - * - * void (*set_termios)(struct tty_struct *tty, struct ktermios * old); - * - * This function notifies the line discpline that a change has - * been made to the termios structure. - * - * int (*poll)(struct tty_struct * tty, struct file * file, - * poll_table *wait); - * - * This function is called when a user attempts to select/poll on a - * tty device. It is solely the responsibility of the line - * discipline to handle poll requests. - * - * void (*receive_buf)(struct tty_struct *, const unsigned char *cp, - * char *fp, int count); - * - * This function is called by the low-level tty driver to send - * characters received by the hardware to the line discpline for - * processing. <cp> is a pointer to the buffer of input - * character received by the device. <fp> is a pointer to a - * pointer of flag bytes which indicate whether a character was - * received with a parity error, etc. <fp> may be NULL to indicate - * all data received is TTY_NORMAL. - * - * void (*write_wakeup)(struct tty_struct *); - * - * This function is called by the low-level tty driver to signal - * that line discpline should try to send more characters to the - * low-level driver for transmission. If the line discpline does - * not have any more data to send, it can just return. If the line - * discipline does have some data to send, please arise a tasklet - * or workqueue to do the real data transfer. Do not send data in - * this hook, it may leads to a deadlock. - * - * int (*hangup)(struct tty_struct *) - * - * Called on a hangup. Tells the discipline that it should - * cease I/O to the tty driver. Can sleep. The driver should - * seek to perform this action quickly but should wait until - * any pending driver I/O is completed.
- * - * void (*dcd_change)(struct tty_struct *tty, unsigned int status) - * - * Tells the discipline that the DCD pin has changed its status. - * Used exclusively by the N_PPS (Pulse-Per-Second) line discipline. - * - * int (*receive_buf2)(struct tty_struct *, const unsigned char *cp, - * char *fp, int count); - * - * This function is called by the low-level tty driver to send - * characters received by the hardware to the line discpline for - * processing. <cp> is a pointer to the buffer of input - * character received by the device. <fp> is a pointer to a - * pointer of flag bytes which indicate whether a character was - * received with a parity error, etc. <fp> may be NULL to indicate - * all data received is TTY_NORMAL. - * If assigned, prefer this function for automatic flow control. - */ - #include #include #include @@ -175,7 +55,128 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, ldsem_down_write(sem, timeout) #endif - +/** + * struct tty_ldisc_ops - ldisc operations + * + * @name: name of this ldisc rendered in /proc/tty/ldiscs + * @num: ``N_*`` number (%N_TTY, %N_HDLC, ...) reserved to this ldisc + * + * @open: ``int ()(struct tty_struct *tty)`` + * + * This function is called when the line discipline is associated with the + * @tty. The line discipline can use this as an opportunity to initialize + * any state needed by the ldisc routines. + * + * @close: ``void ()(struct tty_struct *tty)`` + * + * This function is called when the line discipline is being shutdown, + * either because the @tty is being closed or because the @tty is being + * changed to use a new line discipline + * + * @flush_buffer: ``void ()(struct tty_struct *tty)`` + * + * This function instructs the line discipline to clear its buffers of any + * input characters it may have queued to be delivered to the user mode + * process.
+ * + * @read: ``ssize_t ()(struct tty_struct *tty, struct file *file, + * unsigned char *buf, size_t nr)`` + * + * This function is called when the user requests to read from the @tty. + * The line discipline will return whatever characters it has buffered up + * for the user. If this function is not defined, the user will receive + * an %EIO error. + * + * @write: ``ssize_t ()(struct tty_struct *tty, struct file *file, + * const unsigned char *buf, size_t nr)`` + * + * This function is called when the user requests to write to the @tty. + * The line discipline will deliver the characters to the low-level tty + * device for transmission, optionally performing some processing on the + * characters first. If this function is not defined, the user will + * receive an %EIO error. + * + * @ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd, + * unsigned long arg)`` + * + * This function is called when the user requests an ioctl which is not + * handled by the tty layer or the low-level tty driver. It is intended + * for ioctls which affect line discpline operation. Note that the search + * order for ioctls is (1) tty layer, (2) tty low-level driver, (3) line + * discpline. So a low-level driver can "grab" an ioctl request before + * the line discpline has a chance to see it. + * + * @compat_ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd, + * unsigned long arg)`` + * + * Process ioctl calls from 32-bit process on 64-bit system. + * + * Note that only ioctls that are neither "pointer to compatible + * structure" nor tty-generic. Something private that takes an integer or + * a pointer to wordsize-sensitive structure belongs here, but most of + * ldiscs will happily leave it %NULL. + * + * @set_termios: ``void ()(struct tty_struct *tty, struct ktermios *old)`` + * + * This function notifies the line discpline that a change has been made + * to the termios structure. 
+ * + * @poll: ``int ()(struct tty_struct *tty, struct file *file, + * struct poll_table_struct *wait)`` + * + * This function is called when a user attempts to select/poll on a @tty + * device. It is solely the responsibility of the line discipline to + * handle poll requests. + * + * @hangup: ``void ()(struct tty_struct *tty)`` + * + * Called on a hangup. Tells the discipline that it should cease I/O to + * the tty driver. Can sleep. The driver should seek to perform this + * action quickly but should wait until any pending driver I/O is + * completed. + * + * @receive_buf: ``void ()(struct tty_struct *tty, const unsigned char *cp, + * const char *fp, int count)`` + * + * This function is called by the low-level tty driver to send characters + * received by the hardware to the line discpline for processing. @cp is + * a pointer to the buffer of input character received by the device. @fp + * is a pointer to an array of flag bytes which indicate whether a + * character was received with a parity error, etc. @fp may be %NULL to + * indicate all data received is %TTY_NORMAL. + * + * @write_wakeup: ``void ()(struct tty_struct *tty)`` + * + * This function is called by the low-level tty driver to signal that line + * discpline should try to send more characters to the low-level driver + * for transmission. If the line discpline does not have any more data to + * send, it can just return. If the line discipline does have some data to + * send, please arise a tasklet or workqueue to do the real data transfer. + * Do not send data in this hook, it may lead to a deadlock. + * + * @dcd_change: ``void ()(struct tty_struct *tty, unsigned int status)`` + * + * Tells the discipline that the DCD pin has changed its status. Used + * exclusively by the %N_PPS (Pulse-Per-Second) line discipline. 
+ * + * @receive_buf2: ``int ()(struct tty_struct *tty, const unsigned char *cp, + * const char *fp, int count)`` + * + * This function is called by the low-level tty driver to send characters + * received by the hardware to the line discipline for processing. @cp is a + * pointer to the buffer of input character received by the device. @fp + * is a pointer to an array of flag bytes which indicate whether a + * character was received with a parity error, etc. @fp may be %NULL to + * indicate all data received is %TTY_NORMAL. If assigned, prefer this + * function for automatic flow control. + * + * @owner: module containing this ldisc (for reference counting) + * + * This structure defines the interface between the tty line discipline + * implementation and the tty routines. The above routines can be defined. + * Unless noted otherwise, they are optional, and can be filled in with a %NULL + * pointer. + */ struct tty_ldisc_ops { char *name; int num; @@ -183,8 +184,8 @@ struct tty_ldisc_ops { /* * The following routines are called from above. */ - int (*open)(struct tty_struct *); - void (*close)(struct tty_struct *); + int (*open)(struct tty_struct *tty); + void (*close)(struct tty_struct *tty); void (*flush_buffer)(struct tty_struct *tty); ssize_t (*read)(struct tty_struct *tty, struct file *file, unsigned char *buf, size_t nr, @@ -196,18 +197,18 @@ struct tty_ldisc_ops { int (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); void (*set_termios)(struct tty_struct *tty, struct ktermios *old); - __poll_t (*poll)(struct tty_struct *, struct file *, - struct poll_table_struct *); + __poll_t (*poll)(struct tty_struct *tty, struct file *file, + struct poll_table_struct *wait); void (*hangup)(struct tty_struct *tty); /* * The following routines are called from below.
*/ - void (*receive_buf)(struct tty_struct *, const unsigned char *cp, + void (*receive_buf)(struct tty_struct *tty, const unsigned char *cp, const char *fp, int count); - void (*write_wakeup)(struct tty_struct *); - void (*dcd_change)(struct tty_struct *, unsigned int); - int (*receive_buf2)(struct tty_struct *, const unsigned char *cp, + void (*write_wakeup)(struct tty_struct *tty); + void (*dcd_change)(struct tty_struct *tty, unsigned int status); + int (*receive_buf2)(struct tty_struct *tty, const unsigned char *cp, const char *fp, int count); struct module *owner; -- cgit v1.2.3 From 29d5ef685948369602ccd5c04d2a215449c4b943 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 26 Nov 2021 09:15:55 +0100 Subject: tty: combine tty_operations triple docs into kernel-doc In Documentation/driver-api/serial/tty.rst, there are triplicated texts about some struct tty_operations' hooks. Combine them into existing kernel-doc comments of struct tty_operations and drop them from the Documentation/. Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211126081611.11001-8-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 5611992ab26a..41274d551e28 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -47,15 +47,17 @@ struct serial_struct; * routine is mandatory; if this routine is not filled in, the attempted * open will fail with %ENODEV. * - * Required method. Called with tty lock held. + * Required method. Called with tty lock held. May sleep. * * @close: ``void ()(struct tty_struct *tty, struct file *)`` * - * This routine is called when a particular @tty device is closed. + * This routine is called when a particular @tty device is closed. 
At the + * point of return from this call the driver must make no further ldisc + * calls of any kind. * * Remark: called even if the corresponding @open() failed. * - * Required method. Called with tty lock held. + * Required method. Called with tty lock held. May sleep. * * @shutdown: ``void ()(struct tty_struct *tty)`` * @@ -77,7 +79,10 @@ struct serial_struct; * user space or kernel space. This routine will return the * number of characters actually accepted for writing. * - * Optional: Required for writable devices. + * May occur in parallel in special cases. Because this includes panic + * paths drivers generally shouldn't try and do clever locking here. + * + * Optional: Required for writable devices. May not sleep. * * @put_char: ``int ()(struct tty_struct *tty, unsigned char ch)`` * @@ -105,6 +110,9 @@ struct serial_struct; * to change as output buffers get emptied, or if the output flow * control is acted. * + * The ldisc is responsible for being intelligent about multi-threading of + * write_room/write calls + * * Required if @write method is provided else not needed. Do not call this * function directly, call tty_write_room() * @@ -136,14 +144,21 @@ struct serial_struct; * @set_termios: ``void ()(struct tty_struct *tty, struct ktermios *old)`` * * This routine allows the @tty driver to be notified when device's - * termios settings have changed. + * termios settings have changed. New settings are in @tty->termios. + * Previous settings are passed in the @old argument. * - * Optional: Called under the @tty->termios_rwsem. + * The API is defined such that the driver should return the actual modes + * selected. This means that the driver is responsible for modifying any + * bits in @tty->termios it cannot fulfill to indicate the actual modes + * being used. + * + * Optional. Called under the @tty->termios_rwsem. May sleep. 
* * @set_ldisc: ``void ()(struct tty_struct *tty)`` * * This routine allows the @tty driver to be notified when the device's - * line discipline is being changed. + * line discipline is being changed. At the point this is done the + * discipline is not yet usable. * * Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem. * @@ -153,6 +168,9 @@ struct serial_struct; * discipline are close to full, and it should somehow signal that no more * characters should be sent to the @tty. * + * Serialization including with @unthrottle() is the job of the ldisc + * layer. + * * Optional: Always invoke via tty_throttle_safe(). Called under the * @tty->termios_rwsem. * @@ -204,7 +222,7 @@ struct serial_struct; * hardware is expected to do the delay work itself. 0 and -1 are still * used for on/off. * - * Optional: Required for %TCSBRK/%BRKP/etc. handling. + * Optional: Required for %TCSBRK/%BRKP/etc. handling. May sleep. * * @flush_buffer: ``void ()(struct tty_struct *tty)`` * @@ -222,7 +240,7 @@ struct serial_struct; * reached. * * Optional: If not provided, the device is assumed to have no FIFO. - * Usually correct to invoke via tty_wait_until_sent(). + * Usually correct to invoke via tty_wait_until_sent(). May sleep. * * @send_xchar: ``void ()(struct tty_struct *tty, char ch)`` * -- cgit v1.2.3 From 40f4268cddb93d17a11579920d940c2dca8b9445 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 26 Nov 2021 09:15:56 +0100 Subject: tty: combine tty_ldisc_ops docs into kernel-doc In Documentation/driver-api/serial/tty.rst, there are duplicated texts about some struct tty_ldisc_ops' hooks. Combine them into existing kernel-doc comments of struct tty_ldisc_ops and drop them from the Documentation/. 
Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211126081611.11001-9-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 67 ++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index e0da0ba02de9..e85002b56752 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -61,33 +61,45 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, * @name: name of this ldisc rendered in /proc/tty/ldiscs * @num: ``N_*`` number (%N_TTY, %N_HDLC, ...) reserved to this ldisc * - * @open: ``int ()(struct tty_struct *tty)`` + * @open: [TTY] ``int ()(struct tty_struct *tty)`` * * This function is called when the line discipline is associated with the - * @tty. The line discipline can use this as an opportunity to initialize - * any state needed by the ldisc routines. + * @tty. No other call into the line discipline for this tty will occur + * until it completes successfully. It should initialize any state needed + * by the ldisc, and set @tty->receive_room to the maximum amount of data + * the line discipline is willing to accept from the driver with a single + * call to @receive_buf(). Returning an error will prevent the ldisc from + * being attached. * - * @close: ``void ()(struct tty_struct *tty)`` + * Can sleep. + * + * @close: [TTY] ``void ()(struct tty_struct *tty)`` * * This function is called when the line discipline is being shutdown, * either because the @tty is being closed or because the @tty is being - * changed to use a new line discipline + * changed to use a new line discipline. At the point of execution no + * further users will enter the ldisc code for this tty. + * + * Can sleep. 
* - * @flush_buffer: ``void ()(struct tty_struct *tty)`` + * @flush_buffer: [TTY] ``void ()(struct tty_struct *tty)`` * * This function instructs the line discipline to clear its buffers of any * input characters it may have queued to be delivered to the user mode - * process. + * process. It may be called at any point between open and close. * - * @read: ``ssize_t ()(struct tty_struct *tty, struct file *file, + * @read: [TTY] ``ssize_t ()(struct tty_struct *tty, struct file *file, * unsigned char *buf, size_t nr)`` * * This function is called when the user requests to read from the @tty. * The line discipline will return whatever characters it has buffered up * for the user. If this function is not defined, the user will receive - * an %EIO error. + * an %EIO error. Multiple read calls may occur in parallel and the ldisc + * must deal with serialization issues. + * + * Can sleep. * - * @write: ``ssize_t ()(struct tty_struct *tty, struct file *file, + * @write: [TTY] ``ssize_t ()(struct tty_struct *tty, struct file *file, * const unsigned char *buf, size_t nr)`` * * This function is called when the user requests to write to the @tty. @@ -96,7 +108,9 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, * characters first. If this function is not defined, the user will * receive an %EIO error. * - * @ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd, + * Can sleep. + * + * @ioctl: [TTY] ``int ()(struct tty_struct *tty, unsigned int cmd, * unsigned long arg)`` * * This function is called when the user requests an ioctl which is not @@ -106,7 +120,7 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, * discpline. So a low-level driver can "grab" an ioctl request before * the line discpline has a chance to see it. 
* - * @compat_ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd, + * @compat_ioctl: [TTY] ``int ()(struct tty_struct *tty, unsigned int cmd, * unsigned long arg)`` * * Process ioctl calls from 32-bit process on 64-bit system. @@ -116,27 +130,29 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, * a pointer to wordsize-sensitive structure belongs here, but most of * ldiscs will happily leave it %NULL. * - * @set_termios: ``void ()(struct tty_struct *tty, struct ktermios *old)`` + * @set_termios: [TTY] ``void ()(struct tty_struct *tty, struct ktermios *old)`` * * This function notifies the line discpline that a change has been made * to the termios structure. * - * @poll: ``int ()(struct tty_struct *tty, struct file *file, + * @poll: [TTY] ``int ()(struct tty_struct *tty, struct file *file, * struct poll_table_struct *wait)`` * * This function is called when a user attempts to select/poll on a @tty * device. It is solely the responsibility of the line discipline to * handle poll requests. * - * @hangup: ``void ()(struct tty_struct *tty)`` + * @hangup: [TTY] ``void ()(struct tty_struct *tty)`` * * Called on a hangup. Tells the discipline that it should cease I/O to - * the tty driver. Can sleep. The driver should seek to perform this - * action quickly but should wait until any pending driver I/O is - * completed. + * the tty driver. The driver should seek to perform this action quickly + * but should wait until any pending driver I/O is completed. No further + * calls into the ldisc code will occur. + * + * Can sleep. * - * @receive_buf: ``void ()(struct tty_struct *tty, const unsigned char *cp, - * const char *fp, int count)`` + * @receive_buf: [DRV] ``void ()(struct tty_struct *tty, + * const unsigned char *cp, const char *fp, int count)`` * * This function is called by the low-level tty driver to send characters * received by the hardware to the line discpline for processing. 
@cp is @@ -145,7 +161,7 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, * character was received with a parity error, etc. @fp may be %NULL to * indicate all data received is %TTY_NORMAL. * - * @write_wakeup: ``void ()(struct tty_struct *tty)`` + * @write_wakeup: [DRV] ``void ()(struct tty_struct *tty)`` * * This function is called by the low-level tty driver to signal that line * discpline should try to send more characters to the low-level driver @@ -154,13 +170,13 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, * send, please arise a tasklet or workqueue to do the real data transfer. * Do not send data in this hook, it may lead to a deadlock. * - * @dcd_change: ``void ()(struct tty_struct *tty, unsigned int status)`` + * @dcd_change: [DRV] ``void ()(struct tty_struct *tty, unsigned int status)`` * * Tells the discipline that the DCD pin has changed its status. Used * exclusively by the %N_PPS (Pulse-Per-Second) line discipline. * - * @receive_buf2: ``int ()(struct tty_struct *tty, const unsigned char *cp, - * const char *fp, int count)`` + * @receive_buf2: [DRV] ``int ()(struct tty_struct *tty, + * const unsigned char *cp, const char *fp, int count)`` * * This function is called by the low-level tty driver to send characters * received by the hardware to the line discpline for processing. @cp is a @@ -176,6 +192,9 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, * implementation and the tty routines. The above routines can be defined. * Unless noted otherwise, they are optional, and can be filled in with a %NULL * pointer. + * + * Hooks marked [TTY] are invoked from the TTY core, the [DRV] ones from the + * tty_driver side. 
*/ struct tty_ldisc_ops { char *name; -- cgit v1.2.3 From 4072254f96f954ec0d34899f15d987803b6d76a2 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 26 Nov 2021 09:15:57 +0100 Subject: tty: reformat tty_struct::flags into kernel-doc Move the partial tty_struct::flags documentation from tty_ldisc to the tty.h header and combine it with the one-liners present there. Convert all those to kernel-doc. This way, we can simply reference the documentation in Documentation while the text is still along the definitions. Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211126081611.11001-10-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 74 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index da49ad9be281..7b0a5d478ef6 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -263,26 +263,72 @@ struct tty_file_private { /* tty magic number */ #define TTY_MAGIC 0x5401 -/* - * These bits are used in the flags field of the tty structure. +/** + * DOC: TTY Struct Flags + * + * These bits are used in the :c:member:`tty_struct.flags` field. * * So that interrupts won't be able to mess up the queues, * copy_to_cooked must be atomic with respect to itself, as must * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. + * + * TTY_THROTTLED + * Driver input is throttled. The ldisc should call + * :c:member:`tty_driver.unthrottle()` in order to resume reception when + * it is ready to process more data (at threshold min). + * + * TTY_IO_ERROR + * If set, causes all subsequent userspace read/write calls on the tty to + * fail, returning -%EIO. (May be no ldisc too.) + * + * TTY_OTHER_CLOSED + * Device is a pty and the other side has closed. + * + * TTY_EXCLUSIVE + * Exclusive open mode (a single opener). 
+ * + * TTY_DO_WRITE_WAKEUP + * If set, causes the driver to call the + * :c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume + * transmission when it can accept more data to transmit. + * + * TTY_LDISC_OPEN + * Indicates that a line discipline is open. For debugging purposes only. + * + * TTY_PTY_LOCK + * A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic. + * + * TTY_NO_WRITE_SPLIT + * Prevent driver from splitting up writes into smaller chunks (preserve + * write boundaries to driver). + * + * TTY_HUPPED + * The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`. + * + * TTY_HUPPING + * The TTY is in the process of hanging up to abort potential readers. + * + * TTY_LDISC_CHANGING + * Line discipline for this TTY is being changed. I/O should not block + * when this is set. Use tty_io_nonblock() to check. + * + * TTY_LDISC_HALTED + * Line discipline for this TTY was stopped. No work should be queued to + * this ldisc. */ -#define TTY_THROTTLED 0 /* Call unthrottle() at threshold min */ -#define TTY_IO_ERROR 1 /* Cause an I/O error (may be no ldisc too) */ -#define TTY_OTHER_CLOSED 2 /* Other side (if any) has closed */ -#define TTY_EXCLUSIVE 3 /* Exclusive open mode */ -#define TTY_DO_WRITE_WAKEUP 5 /* Call write_wakeup after queuing new */ -#define TTY_LDISC_OPEN 11 /* Line discipline is open */ -#define TTY_PTY_LOCK 16 /* pty private */ -#define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ -#define TTY_HUPPED 18 /* Post driver->hangup() */ -#define TTY_HUPPING 19 /* Hangup in progress */ -#define TTY_LDISC_CHANGING 20 /* Change pending - non-block IO */ -#define TTY_LDISC_HALTED 22 /* Line discipline is halted */ +#define TTY_THROTTLED 0 +#define TTY_IO_ERROR 1 +#define TTY_OTHER_CLOSED 2 +#define TTY_EXCLUSIVE 3 +#define TTY_DO_WRITE_WAKEUP 5 +#define TTY_LDISC_OPEN 11 +#define TTY_PTY_LOCK 16 +#define TTY_NO_WRITE_SPLIT 17 +#define TTY_HUPPED 18 +#define TTY_HUPPING 19 +#define 
TTY_LDISC_CHANGING 20 +#define TTY_LDISC_HALTED 22 static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { -- cgit v1.2.3 From 34d809f8b4ff68f63e8d7f71d93d150382c6bb8b Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 26 Nov 2021 09:15:58 +0100 Subject: tty: reformat TTY_DRIVER_ flags into kernel-doc We want to reference TTY_DRIVER_* flags in Documentation/ later in this series. But the current documentation in the TTY_DRIVER_*'s header does not allow that. Reformat it to kernel-doc using "DOC" directive and line-feeds, so that we can include it as it is. Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20211126081611.11001-11-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 90 ++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 41274d551e28..4841d8069c07 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -492,49 +492,53 @@ static inline void tty_set_operations(struct tty_driver *driver, /* tty driver magic number */ #define TTY_DRIVER_MAGIC 0x5402 -/* - * tty driver flags - * - * TTY_DRIVER_RESET_TERMIOS --- requests the tty layer to reset the - * termios setting when the last process has closed the device. - * Used for PTY's, in particular. - * - * TTY_DRIVER_REAL_RAW --- if set, indicates that the driver will - * guarantee never to set any special character handling - * flags if ((IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || - * !INPCK)). That is, if there is no reason for the driver to - * send notifications of parity and break characters up to the - * line driver, it won't do so. This allows the line driver to - * optimize for this case if this flag is set. (Note that there - * is also a promise, if the above case is true, not to signal - * overruns, either.) 
- * - * TTY_DRIVER_DYNAMIC_DEV --- if set, the individual tty devices need - * to be registered with a call to tty_register_device() when the - * device is found in the system and unregistered with a call to - * tty_unregister_device() so the devices will be show up - * properly in sysfs. If not set, driver->num entries will be - * created by the tty core in sysfs when tty_register_driver() is - * called. This is to be used by drivers that have tty devices - * that can appear and disappear while the main tty driver is - * registered with the tty core. - * - * TTY_DRIVER_DEVPTS_MEM -- don't use the standard arrays, instead - * use dynamic memory keyed through the devpts filesystem. This - * is only applicable to the pty driver. - * - * TTY_DRIVER_HARDWARE_BREAK -- hardware handles break signals. Pass - * the requested timeout to the caller instead of using a simple - * on/off interface. - * - * TTY_DRIVER_DYNAMIC_ALLOC -- do not allocate structures which are - * needed per line for this driver as it would waste memory. - * The driver will take care. - * - * TTY_DRIVER_UNNUMBERED_NODE -- do not create numbered /dev nodes. In - * other words create /dev/ttyprintk and not /dev/ttyprintk0. - * Applicable only when a driver for a single tty device is - * being allocated. +/** + * DOC: TTY Driver Flags + * + * TTY_DRIVER_RESET_TERMIOS + * Requests the tty layer to reset the termios setting when the last + * process has closed the device. Used for PTYs, in particular. + * + * TTY_DRIVER_REAL_RAW + * Indicates that the driver will guarantee not to set any special + * character handling flags if this is set for the tty: + * + * ``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)`` + * + * That is, if there is no reason for the driver to + * send notifications of parity and break characters up to the line + * driver, it won't do so. This allows the line driver to optimize for + * this case if this flag is set. 
(Note that there is also a promise, if + * the above case is true, not to signal overruns, either.) + * + * TTY_DRIVER_DYNAMIC_DEV + * The individual tty devices need to be registered with a call to + * tty_register_device() when the device is found in the system and + * unregistered with a call to tty_unregister_device() so the devices will + * show up properly in sysfs. If not set, all &tty_driver.num entries + * will be created by the tty core in sysfs when tty_register_driver() is + * called. This is to be used by drivers that have tty devices that can + * appear and disappear while the main tty driver is registered with the + * tty core. + * + * TTY_DRIVER_DEVPTS_MEM + * Don't use the standard arrays (&tty_driver.ttys and + * &tty_driver.termios), instead use dynamic memory keyed through the + * devpts filesystem. This is only applicable to the PTY driver. + * + * TTY_DRIVER_HARDWARE_BREAK + * Hardware handles break signals. Pass the requested timeout to the + * &tty_operations.break_ctl instead of using a simple on/off interface. + * + * TTY_DRIVER_DYNAMIC_ALLOC + * Do not allocate structures which are needed per line for this driver + * (&tty_driver.ports) as it would waste memory. The driver will take + * care. This is only applicable to the PTY driver. + * + * TTY_DRIVER_UNNUMBERED_NODE + * Do not create numbered ``/dev`` nodes. For example, create + * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a + * driver for a single tty device is being allocated. */ #define TTY_DRIVER_INSTALLED 0x0001 #define TTY_DRIVER_RESET_TERMIOS 0x0002 -- cgit v1.2.3 From df0e68c1e9945e2ee86d266ce45597bbd8299b06 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Wed, 17 Nov 2021 12:05:59 +0000 Subject: comedi: Move the main COMEDI headers Move the main COMEDI driver headers out of "drivers/comedi/" into new directory "include/linux/comedi/". These are "comedidev.h", "comedilib.h", "comedi_pci.h", "comedi_pcmcia.h", and "comedi_usb.h".
Additionally, move the user-space API header "comedi.h" into "include/uapi/linux/" and add "WITH Linux-syscall-note" to its SPDX-License-Identifier. Update the "COMEDI DRIVERS" section of the MAINTAINERS file to account for these changes. Signed-off-by: Ian Abbott Link: https://lore.kernel.org/r/20211117120604.117740-2-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/comedi/comedi_pci.h | 56 ++ include/linux/comedi/comedi_pcmcia.h | 48 ++ include/linux/comedi/comedi_usb.h | 41 ++ include/linux/comedi/comedidev.h | 1053 ++++++++++++++++++++++++++++++++++ include/linux/comedi/comedilib.h | 26 + 5 files changed, 1224 insertions(+) create mode 100644 include/linux/comedi/comedi_pci.h create mode 100644 include/linux/comedi/comedi_pcmcia.h create mode 100644 include/linux/comedi/comedi_usb.h create mode 100644 include/linux/comedi/comedidev.h create mode 100644 include/linux/comedi/comedilib.h (limited to 'include/linux') diff --git a/include/linux/comedi/comedi_pci.h b/include/linux/comedi/comedi_pci.h new file mode 100644 index 000000000000..2fb50663e3ed --- /dev/null +++ b/include/linux/comedi/comedi_pci.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * comedi_pci.h + * header file for Comedi PCI drivers + * + * COMEDI - Linux Control and Measurement Device Interface + * Copyright (C) 1997-2000 David A. 
Schleef + */ + +#ifndef _COMEDI_PCI_H +#define _COMEDI_PCI_H + +#include +#include + +/* + * PCI Vendor IDs not in + */ +#define PCI_VENDOR_ID_KOLTER 0x1001 +#define PCI_VENDOR_ID_ICP 0x104c +#define PCI_VENDOR_ID_DT 0x1116 +#define PCI_VENDOR_ID_IOTECH 0x1616 +#define PCI_VENDOR_ID_CONTEC 0x1221 +#define PCI_VENDOR_ID_RTD 0x1435 +#define PCI_VENDOR_ID_HUMUSOFT 0x186c + +struct pci_dev *comedi_to_pci_dev(struct comedi_device *dev); + +int comedi_pci_enable(struct comedi_device *dev); +void comedi_pci_disable(struct comedi_device *dev); +void comedi_pci_detach(struct comedi_device *dev); + +int comedi_pci_auto_config(struct pci_dev *pcidev, struct comedi_driver *driver, + unsigned long context); +void comedi_pci_auto_unconfig(struct pci_dev *pcidev); + +int comedi_pci_driver_register(struct comedi_driver *comedi_driver, + struct pci_driver *pci_driver); +void comedi_pci_driver_unregister(struct comedi_driver *comedi_driver, + struct pci_driver *pci_driver); + +/** + * module_comedi_pci_driver() - Helper macro for registering a comedi PCI driver + * @__comedi_driver: comedi_driver struct + * @__pci_driver: pci_driver struct + * + * Helper macro for comedi PCI drivers which do not do anything special + * in module init/exit. This eliminates a lot of boilerplate. 
Each + * module may only use this macro once, and calling it replaces + * module_init() and module_exit() + */ +#define module_comedi_pci_driver(__comedi_driver, __pci_driver) \ + module_driver(__comedi_driver, comedi_pci_driver_register, \ + comedi_pci_driver_unregister, &(__pci_driver)) + +#endif /* _COMEDI_PCI_H */ diff --git a/include/linux/comedi/comedi_pcmcia.h b/include/linux/comedi/comedi_pcmcia.h new file mode 100644 index 000000000000..a33dfb65b869 --- /dev/null +++ b/include/linux/comedi/comedi_pcmcia.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * comedi_pcmcia.h + * header file for Comedi PCMCIA drivers + * + * COMEDI - Linux Control and Measurement Device Interface + * Copyright (C) 1997-2000 David A. Schleef + */ + +#ifndef _COMEDI_PCMCIA_H +#define _COMEDI_PCMCIA_H + +#include +#include +#include + +struct pcmcia_device *comedi_to_pcmcia_dev(struct comedi_device *dev); + +int comedi_pcmcia_enable(struct comedi_device *dev, + int (*conf_check)(struct pcmcia_device *p_dev, + void *priv_data)); +void comedi_pcmcia_disable(struct comedi_device *dev); + +int comedi_pcmcia_auto_config(struct pcmcia_device *link, + struct comedi_driver *driver); +void comedi_pcmcia_auto_unconfig(struct pcmcia_device *link); + +int comedi_pcmcia_driver_register(struct comedi_driver *comedi_driver, + struct pcmcia_driver *pcmcia_driver); +void comedi_pcmcia_driver_unregister(struct comedi_driver *comedi_driver, + struct pcmcia_driver *pcmcia_driver); + +/** + * module_comedi_pcmcia_driver() - Helper macro for registering a comedi + * PCMCIA driver + * @__comedi_driver: comedi_driver struct + * @__pcmcia_driver: pcmcia_driver struct + * + * Helper macro for comedi PCMCIA drivers which do not do anything special + * in module init/exit. This eliminates a lot of boilerplate. 
Each + * module may only use this macro once, and calling it replaces + * module_init() and module_exit() + */ +#define module_comedi_pcmcia_driver(__comedi_driver, __pcmcia_driver) \ + module_driver(__comedi_driver, comedi_pcmcia_driver_register, \ + comedi_pcmcia_driver_unregister, &(__pcmcia_driver)) + +#endif /* _COMEDI_PCMCIA_H */ diff --git a/include/linux/comedi/comedi_usb.h b/include/linux/comedi/comedi_usb.h new file mode 100644 index 000000000000..5d17dd425bd2 --- /dev/null +++ b/include/linux/comedi/comedi_usb.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* comedi_usb.h + * header file for USB Comedi drivers + * + * COMEDI - Linux Control and Measurement Device Interface + * Copyright (C) 1997-2000 David A. Schleef + */ + +#ifndef _COMEDI_USB_H +#define _COMEDI_USB_H + +#include +#include + +struct usb_interface *comedi_to_usb_interface(struct comedi_device *dev); +struct usb_device *comedi_to_usb_dev(struct comedi_device *dev); + +int comedi_usb_auto_config(struct usb_interface *intf, + struct comedi_driver *driver, unsigned long context); +void comedi_usb_auto_unconfig(struct usb_interface *intf); + +int comedi_usb_driver_register(struct comedi_driver *comedi_driver, + struct usb_driver *usb_driver); +void comedi_usb_driver_unregister(struct comedi_driver *comedi_driver, + struct usb_driver *usb_driver); + +/** + * module_comedi_usb_driver() - Helper macro for registering a comedi USB driver + * @__comedi_driver: comedi_driver struct + * @__usb_driver: usb_driver struct + * + * Helper macro for comedi USB drivers which do not do anything special + * in module init/exit. This eliminates a lot of boilerplate. 
Each + * module may only use this macro once, and calling it replaces + * module_init() and module_exit() + */ +#define module_comedi_usb_driver(__comedi_driver, __usb_driver) \ + module_driver(__comedi_driver, comedi_usb_driver_register, \ + comedi_usb_driver_unregister, &(__usb_driver)) + +#endif /* _COMEDI_USB_H */ diff --git a/include/linux/comedi/comedidev.h b/include/linux/comedi/comedidev.h new file mode 100644 index 000000000000..0a1150900ef3 --- /dev/null +++ b/include/linux/comedi/comedidev.h @@ -0,0 +1,1053 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * comedidev.h + * header file for kernel-only structures, variables, and constants + * + * COMEDI - Linux Control and Measurement Device Interface + * Copyright (C) 1997-2000 David A. Schleef + */ + +#ifndef _COMEDIDEV_H +#define _COMEDIDEV_H + +#include +#include +#include +#include +#include +#include + +#define COMEDI_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c)) +#define COMEDI_VERSION_CODE COMEDI_VERSION(COMEDI_MAJORVERSION, \ + COMEDI_MINORVERSION, COMEDI_MICROVERSION) +#define COMEDI_RELEASE VERSION + +#define COMEDI_NUM_BOARD_MINORS 0x30 + +/** + * struct comedi_subdevice - Working data for a COMEDI subdevice + * @device: COMEDI device to which this subdevice belongs. (Initialized by + * comedi_alloc_subdevices().) + * @index: Index of this subdevice within device's array of subdevices. + * (Initialized by comedi_alloc_subdevices().) + * @type: Type of subdevice from &enum comedi_subdevice_type. (Initialized by + * the low-level driver.) + * @n_chan: Number of channels the subdevice supports. (Initialized by the + * low-level driver.) + * @subdev_flags: Various "SDF" flags indicating aspects of the subdevice to + * the COMEDI core and user application. (Initialized by the low-level + * driver.) + * @len_chanlist: Maximum length of a channel list if the subdevice supports + * asynchronous acquisition commands. 
(Optionally initialized by the + * low-level driver, or changed from 0 to 1 during post-configuration.) + * @private: Private data pointer which is either set by the low-level driver + * itself, or by a call to comedi_alloc_spriv() which allocates storage. + * In the latter case, the storage is automatically freed after the + * low-level driver's "detach" handler is called for the device. + * (Initialized by the low-level driver.) + * @async: Pointer to &struct comedi_async if the subdevice supports + * asynchronous acquisition commands. (Allocated and initialized during + * post-configuration if needed.) + * @lock: Pointer to a file object that performed a %COMEDI_LOCK ioctl on the + * subdevice. (Initially NULL.) + * @busy: Pointer to a file object that is performing an asynchronous + * acquisition command on the subdevice. (Initially NULL.) + * @runflags: Internal flags for use by COMEDI core, mostly indicating whether + * an asynchronous acquisition command is running. + * @spin_lock: Generic spin-lock for use by the COMEDI core and the low-level + * driver. (Initialized by comedi_alloc_subdevices().) + * @io_bits: Bit-mask indicating the channel directions for a DIO subdevice + * with no more than 32 channels. A '1' at a bit position indicates the + * corresponding channel is configured as an output. (Initialized by the + * low-level driver for a DIO subdevice. Forced to all-outputs during + * post-configuration for a digital output subdevice.) + * @maxdata: If non-zero, this is the maximum raw data value of each channel. + * If zero, the maximum data value is channel-specific. (Initialized by + * the low-level driver.) + * @maxdata_list: If the maximum data value is channel-specific, this points + * to an array of maximum data values indexed by channel index. + * (Initialized by the low-level driver.) + * @range_table: If non-NULL, this points to a COMEDI range table for the + * subdevice. If NULL, the range table is channel-specific.
(Initialized + * by the low-level driver, will be set to an "invalid" range table during + * post-configuration if @range_table and @range_table_list are both + * NULL.) + * @range_table_list: If the COMEDI range table is channel-specific, this + * points to an array of pointers to COMEDI range tables indexed by + * channel number. (Initialized by the low-level driver.) + * @chanlist: Not used. + * @insn_read: Optional pointer to a handler for the %INSN_READ instruction. + * (Initialized by the low-level driver, or set to a default handler + * during post-configuration.) + * @insn_write: Optional pointer to a handler for the %INSN_WRITE instruction. + * (Initialized by the low-level driver, or set to a default handler + * during post-configuration.) + * @insn_bits: Optional pointer to a handler for the %INSN_BITS instruction + * for a digital input, digital output or digital input/output subdevice. + * (Initialized by the low-level driver, or set to a default handler + * during post-configuration.) + * @insn_config: Optional pointer to a handler for the %INSN_CONFIG + * instruction. (Initialized by the low-level driver, or set to a default + * handler during post-configuration.) + * @do_cmd: If the subdevice supports asynchronous acquisition commands, this + * points to a handler to set it up in hardware. (Initialized by the + * low-level driver.) + * @do_cmdtest: If the subdevice supports asynchronous acquisition commands, + * this points to a handler used to check and possibly tweak a prospective + * acquisition command without setting it up in hardware. (Initialized by + * the low-level driver.) + * @poll: If the subdevice supports asynchronous acquisition commands, this + * is an optional pointer to a handler for the %COMEDI_POLL ioctl which + * instructs the low-level driver to synchronize buffers. (Initialized by + * the low-level driver if needed.) 
+ * @cancel: If the subdevice supports asynchronous acquisition commands, this + * points to a handler used to terminate a running command. (Initialized + * by the low-level driver.) + * @buf_change: If the subdevice supports asynchronous acquisition commands, + * this is an optional pointer to a handler that is called when the data + * buffer for handling asynchronous commands is allocated or reallocated. + * (Initialized by the low-level driver if needed.) + * @munge: If the subdevice supports asynchronous acquisition commands and + * uses DMA to transfer data from the hardware to the acquisition buffer, + * this points to a function used to "munge" the data values from the + * hardware into the format expected by COMEDI. (Initialized by the + * low-level driver if needed.) + * @async_dma_dir: If the subdevice supports asynchronous acquisition commands + * and uses DMA to transfer data from the hardware to the acquisition + * buffer, this sets the DMA direction for the buffer. (initialized to + * %DMA_NONE by comedi_alloc_subdevices() and changed by the low-level + * driver if necessary.) + * @state: Handy bit-mask indicating the output states for a DIO or digital + * output subdevice with no more than 32 channels. (Initialized by the + * low-level driver.) + * @class_dev: If the subdevice supports asynchronous acquisition commands, + * this points to a sysfs comediX_subdY device where X is the minor device + * number of the COMEDI device and Y is the subdevice number. The minor + * device number for the sysfs device is allocated dynamically in the + * range 48 to 255. This is used to allow the COMEDI device to be opened + * with a different default read or write subdevice. (Allocated during + * post-configuration if needed.) + * @minor: If @class_dev is set, this is its dynamically allocated minor + * device number. (Set during post-configuration if necessary.) 
+ * @readback: Optional pointer to memory allocated by + * comedi_alloc_subdev_readback() used to hold the values written to + * analog output channels so they can be read back. The storage is + * automatically freed after the low-level driver's "detach" handler is + * called for the device. (Initialized by the low-level driver.) + * + * This is the main control structure for a COMEDI subdevice. If the subdevice + * supports asynchronous acquisition commands, additional information is stored + * in the &struct comedi_async pointed to by @async. + * + * Most of the subdevice is initialized by the low-level driver's "attach" or + * "auto_attach" handlers but parts of it are initialized by + * comedi_alloc_subdevices(), and other parts are initialized during + * post-configuration on return from that handler. + * + * A low-level driver that sets @insn_bits for a digital input, digital output, + * or DIO subdevice may leave @insn_read and @insn_write uninitialized, in + * which case they will be set to a default handler during post-configuration + * that uses @insn_bits to emulate the %INSN_READ and %INSN_WRITE instructions. 
+ */ +struct comedi_subdevice { + struct comedi_device *device; + int index; + int type; + int n_chan; + int subdev_flags; + int len_chanlist; /* maximum length of channel/gain list */ + + void *private; + + struct comedi_async *async; + + void *lock; + void *busy; + unsigned int runflags; + spinlock_t spin_lock; /* generic spin-lock for COMEDI and drivers */ + + unsigned int io_bits; + + unsigned int maxdata; /* if maxdata==0, use list */ + const unsigned int *maxdata_list; /* list is channel specific */ + + const struct comedi_lrange *range_table; + const struct comedi_lrange *const *range_table_list; + + unsigned int *chanlist; /* driver-owned chanlist (not used) */ + + int (*insn_read)(struct comedi_device *dev, struct comedi_subdevice *s, + struct comedi_insn *insn, unsigned int *data); + int (*insn_write)(struct comedi_device *dev, struct comedi_subdevice *s, + struct comedi_insn *insn, unsigned int *data); + int (*insn_bits)(struct comedi_device *dev, struct comedi_subdevice *s, + struct comedi_insn *insn, unsigned int *data); + int (*insn_config)(struct comedi_device *dev, + struct comedi_subdevice *s, + struct comedi_insn *insn, + unsigned int *data); + + int (*do_cmd)(struct comedi_device *dev, struct comedi_subdevice *s); + int (*do_cmdtest)(struct comedi_device *dev, + struct comedi_subdevice *s, + struct comedi_cmd *cmd); + int (*poll)(struct comedi_device *dev, struct comedi_subdevice *s); + int (*cancel)(struct comedi_device *dev, struct comedi_subdevice *s); + + /* called when the buffer changes */ + int (*buf_change)(struct comedi_device *dev, + struct comedi_subdevice *s); + + void (*munge)(struct comedi_device *dev, struct comedi_subdevice *s, + void *data, unsigned int num_bytes, + unsigned int start_chan_index); + enum dma_data_direction async_dma_dir; + + unsigned int state; + + struct device *class_dev; + int minor; + + unsigned int *readback; +}; + +/** + * struct comedi_buf_page - Describe a page of a COMEDI buffer + * @virt_addr: Kernel 
address of page. + * @dma_addr: DMA address of page if in DMA coherent memory. + */ +struct comedi_buf_page { + void *virt_addr; + dma_addr_t dma_addr; +}; + +/** + * struct comedi_buf_map - Describe pages in a COMEDI buffer + * @dma_hw_dev: Low-level hardware &struct device pointer copied from the + * COMEDI device's hw_dev member. + * @page_list: Pointer to array of &struct comedi_buf_page, one for each + * page in the buffer. + * @n_pages: Number of pages in the buffer. + * @dma_dir: DMA direction used to allocate pages of DMA coherent memory, + * or %DMA_NONE if pages allocated from regular memory. + * @refcount: &struct kref reference counter used to free the buffer. + * + * A COMEDI data buffer is allocated as individual pages, either in + * conventional memory or DMA coherent memory, depending on the attached, + * low-level hardware device. (The buffer pages also get mapped into the + * kernel's contiguous virtual address space pointed to by the 'prealloc_buf' + * member of &struct comedi_async.) + * + * The buffer is normally freed when the COMEDI device is detached from the + * low-level driver (which may happen due to device removal), but if it happens + * to be mmapped at the time, the pages cannot be freed until the buffer has + * been munmapped. That is what the reference counter is for. (The virtual + * address space pointed by 'prealloc_buf' is freed when the COMEDI device is + * detached.) + */ +struct comedi_buf_map { + struct device *dma_hw_dev; + struct comedi_buf_page *page_list; + unsigned int n_pages; + enum dma_data_direction dma_dir; + struct kref refcount; +}; + +/** + * struct comedi_async - Control data for asynchronous COMEDI commands + * @prealloc_buf: Kernel virtual address of allocated acquisition buffer. + * @prealloc_bufsz: Buffer size (in bytes). + * @buf_map: Map of buffer pages. + * @max_bufsize: Maximum allowed buffer size (in bytes). + * @buf_write_count: "Write completed" count (in bytes, modulo 2**32). 
+ * @buf_write_alloc_count: "Allocated for writing" count (in bytes, + * modulo 2**32). + * @buf_read_count: "Read completed" count (in bytes, modulo 2**32). + * @buf_read_alloc_count: "Allocated for reading" count (in bytes, + * modulo 2**32). + * @buf_write_ptr: Buffer position for writer. + * @buf_read_ptr: Buffer position for reader. + * @cur_chan: Current position in chanlist for scan (for those drivers that + * use it). + * @scans_done: The number of scans completed. + * @scan_progress: Amount received or sent for current scan (in bytes). + * @munge_chan: Current position in chanlist for "munging". + * @munge_count: "Munge" count (in bytes, modulo 2**32). + * @munge_ptr: Buffer position for "munging". + * @events: Bit-vector of events that have occurred. + * @cmd: Details of comedi command in progress. + * @wait_head: Task wait queue for file reader or writer. + * @cb_mask: Bit-vector of events that should wake waiting tasks. + * @inttrig: Software trigger function for command, or NULL. + * + * Note about the ..._count and ..._ptr members: + * + * Think of the _Count values being integers of unlimited size, indexing + * into a buffer of infinite length (though only an advancing portion + * of the buffer of fixed length prealloc_bufsz is accessible at any + * time). Then: + * + * Buf_Read_Count <= Buf_Read_Alloc_Count <= Munge_Count <= + * Buf_Write_Count <= Buf_Write_Alloc_Count <= + * (Buf_Read_Count + prealloc_bufsz) + * + * (Those aren't the actual members, apart from prealloc_bufsz.) When the + * buffer is reset, those _Count values start at 0 and only increase in value, + * maintaining the above inequalities until the next time the buffer is + * reset. 
The buffer is divided into the following regions by the inequalities: + * + * [0, Buf_Read_Count): + * old region no longer accessible + * + * [Buf_Read_Count, Buf_Read_Alloc_Count): + * filled and munged region allocated for reading but not yet read + * + * [Buf_Read_Alloc_Count, Munge_Count): + * filled and munged region not yet allocated for reading + * + * [Munge_Count, Buf_Write_Count): + * filled region not yet munged + * + * [Buf_Write_Count, Buf_Write_Alloc_Count): + * unfilled region allocated for writing but not yet written + * + * [Buf_Write_Alloc_Count, Buf_Read_Count + prealloc_bufsz): + * unfilled region not yet allocated for writing + * + * [Buf_Read_Count + prealloc_bufsz, infinity): + * unfilled region not yet accessible + * + * Data needs to be written into the buffer before it can be read out, + * and may need to be converted (or "munged") between the two + * operations. Extra unfilled buffer space may need to be allocated for + * writing (advancing Buf_Write_Alloc_Count) before new data is written. + * After writing new data, the newly filled space needs to be released + * (advancing Buf_Write_Count). This also results in the new data being + * "munged" (advancing Munge_Count). Before data is read out of the + * buffer, extra space may need to be allocated for reading (advancing + * Buf_Read_Alloc_Count). After the data has been read out, the space + * needs to be released (advancing Buf_Read_Count). + * + * The actual members, buf_read_count, buf_read_alloc_count, + * munge_count, buf_write_count, and buf_write_alloc_count take the + * value of the corresponding capitalized _Count values modulo 2^32 + * (UINT_MAX+1). Subtracting a "lower" _count value from a "higher" + * _count value gives the same answer as subtracting a "lower" _Count + * value from a "higher" _Count value because prealloc_bufsz < UINT_MAX+1. + * The modulo operation is done implicitly.
+ * + * The buf_read_ptr, munge_ptr, and buf_write_ptr members take the value + * of the corresponding capitalized _Count values modulo prealloc_bufsz. + * These correspond to byte indices in the physical buffer. The modulo + * operation is done by subtracting prealloc_bufsz when the value + * exceeds prealloc_bufsz (assuming prealloc_bufsz plus the increment is + * less than or equal to UINT_MAX). + */ +struct comedi_async { + void *prealloc_buf; + unsigned int prealloc_bufsz; + struct comedi_buf_map *buf_map; + unsigned int max_bufsize; + unsigned int buf_write_count; + unsigned int buf_write_alloc_count; + unsigned int buf_read_count; + unsigned int buf_read_alloc_count; + unsigned int buf_write_ptr; + unsigned int buf_read_ptr; + unsigned int cur_chan; + unsigned int scans_done; + unsigned int scan_progress; + unsigned int munge_chan; + unsigned int munge_count; + unsigned int munge_ptr; + unsigned int events; + struct comedi_cmd cmd; + wait_queue_head_t wait_head; + unsigned int cb_mask; + int (*inttrig)(struct comedi_device *dev, struct comedi_subdevice *s, + unsigned int x); +}; + +/** + * enum comedi_cb - &struct comedi_async callback "events" + * @COMEDI_CB_EOS: end-of-scan + * @COMEDI_CB_EOA: end-of-acquisition/output + * @COMEDI_CB_BLOCK: data has arrived, wakes up read() / write() + * @COMEDI_CB_EOBUF: DEPRECATED: end of buffer + * @COMEDI_CB_ERROR: card error during acquisition + * @COMEDI_CB_OVERFLOW: buffer overflow/underflow + * @COMEDI_CB_ERROR_MASK: events that indicate an error has occurred + * @COMEDI_CB_CANCEL_MASK: events that will cancel an async command + */ +enum comedi_cb { + COMEDI_CB_EOS = BIT(0), + COMEDI_CB_EOA = BIT(1), + COMEDI_CB_BLOCK = BIT(2), + COMEDI_CB_EOBUF = BIT(3), + COMEDI_CB_ERROR = BIT(4), + COMEDI_CB_OVERFLOW = BIT(5), + /* masks */ + COMEDI_CB_ERROR_MASK = (COMEDI_CB_ERROR | COMEDI_CB_OVERFLOW), + COMEDI_CB_CANCEL_MASK = (COMEDI_CB_EOA | COMEDI_CB_ERROR_MASK) +}; + +/** + * struct comedi_driver - COMEDI driver 
registration + * @driver_name: Name of driver. + * @module: Owning module. + * @attach: The optional "attach" handler for manually configured COMEDI + * devices. + * @detach: The "detach" handler for deconfiguring COMEDI devices. + * @auto_attach: The optional "auto_attach" handler for automatically + * configured COMEDI devices. + * @num_names: Optional number of "board names" supported. + * @board_name: Optional pointer to a pointer to a board name. The pointer + * to a board name is embedded in an element of a driver-defined array + * of static, read-only board type information. + * @offset: Optional size of each element of the driver-defined array of + * static, read-only board type information, i.e. the offset between each + * pointer to a board name. + * + * This is used with comedi_driver_register() and comedi_driver_unregister() to + * register and unregister a low-level COMEDI driver with the COMEDI core. + * + * If @num_names is non-zero, @board_name should be non-NULL, and @offset + * should be at least sizeof(*board_name). These are used by the handler for + * the %COMEDI_DEVCONFIG ioctl to match a hardware device and its driver by + * board name. If @num_names is zero, the %COMEDI_DEVCONFIG ioctl matches a + * hardware device and its driver by driver name. This is only useful if the + * @attach handler is set. If @num_names is non-zero, the driver's @attach + * handler will be called with the COMEDI device structure's board_ptr member + * pointing to the matched pointer to a board name within the driver's private + * array of static, read-only board type information. + * + * The @detach handler has two roles. If a COMEDI device was successfully + * configured by the @attach or @auto_attach handler, it is called when the + * device is being deconfigured (by the %COMEDI_DEVCONFIG ioctl, or due to + * unloading of the driver, or due to device removal). It is also called when + * the @attach or @auto_attach handler returns an error. 
Therefore, the + * @attach or @auto_attach handlers can defer clean-up on error until the + * @detach handler is called. If the @attach or @auto_attach handlers free + * any resources themselves, they must prevent the @detach handler from + * freeing the same resources. The @detach handler must not assume that all + * resources requested by the @attach or @auto_attach handler were + * successfully allocated. + */ +struct comedi_driver { + /* private: */ + struct comedi_driver *next; /* Next in list of COMEDI drivers. */ + /* public: */ + const char *driver_name; + struct module *module; + int (*attach)(struct comedi_device *dev, struct comedi_devconfig *it); + void (*detach)(struct comedi_device *dev); + int (*auto_attach)(struct comedi_device *dev, unsigned long context); + unsigned int num_names; + const char *const *board_name; + int offset; +}; + +/** + * struct comedi_device - Working data for a COMEDI device + * @use_count: Number of open file objects. + * @driver: Low-level COMEDI driver attached to this COMEDI device. + * @pacer: Optional pointer to a dynamically allocated acquisition pacer + * control. It is freed automatically after the COMEDI device is + * detached from the low-level driver. + * @private: Optional pointer to private data allocated by the low-level + * driver. It is freed automatically after the COMEDI device is + * detached from the low-level driver. + * @class_dev: Sysfs comediX device. + * @minor: Minor device number of COMEDI char device (0-47). + * @detach_count: Counter incremented every time the COMEDI device is detached. + * Used for checking a previous attachment is still valid. + * @hw_dev: Optional pointer to the low-level hardware &struct device. It is + * required for automatically configured COMEDI devices and optional for + * COMEDI devices configured by the %COMEDI_DEVCONFIG ioctl, although + * the bus-specific COMEDI functions only work if it is set correctly. 
+ * It is also passed to dma_alloc_coherent() for COMEDI subdevices that + * have their 'async_dma_dir' member set to something other than + * %DMA_NONE. + * @board_name: Pointer to a COMEDI board name or a COMEDI driver name. When + * the low-level driver's "attach" handler is called by the handler for + * the %COMEDI_DEVCONFIG ioctl, it points to a matched board name + * string if the 'num_names' member of the &struct comedi_driver is + * non-zero; otherwise it points to the low-level driver name string. + * When the low-level driver's "auto_attach" handler is called for an + * automatically configured COMEDI device, it points to the low-level + * driver name string. The low-level driver is free to change it in its + * "attach" or "auto_attach" handler if it wishes. + * @board_ptr: Optional pointer to private, read-only board type information in + * the low-level driver. If the 'num_names' member of the &struct + * comedi_driver is non-zero, the handler for the %COMEDI_DEVCONFIG ioctl + * will point it to a pointer to a matched board name string within the + * driver's private array of static, read-only board type information when + * calling the driver's "attach" handler. The low-level driver is free to + * change it. + * @attached: Flag indicating that the COMEDI device is attached to a low-level + * driver. + * @ioenabled: Flag used to indicate that a PCI device has been enabled and + * its regions requested. + * @spinlock: Generic spin-lock for use by the low-level driver. + * @mutex: Generic mutex for use by the COMEDI core module. + * @attach_lock: &struct rw_semaphore used to guard against the COMEDI device + * being detached while an operation is in progress. The down_write() + * operation is only allowed while @mutex is held and is used when + * changing @attached and @detach_count and calling the low-level driver's + * "detach" handler. The down_read() operation is generally used without + * holding @mutex.
+ * @refcount: &struct kref reference counter for freeing COMEDI device. + * @n_subdevices: Number of COMEDI subdevices allocated by the low-level + * driver for this device. + * @subdevices: Dynamically allocated array of COMEDI subdevices. + * @mmio: Optional pointer to a remapped MMIO region set by the low-level + * driver. + * @iobase: Optional base of an I/O port region requested by the low-level + * driver. + * @iolen: Length of I/O port region requested at @iobase. + * @irq: Optional IRQ number requested by the low-level driver. + * @read_subdev: Optional pointer to a default COMEDI subdevice operated on by + * the read() file operation. Set by the low-level driver. + * @write_subdev: Optional pointer to a default COMEDI subdevice operated on by + * the write() file operation. Set by the low-level driver. + * @async_queue: Storage for fasync_helper(). + * @open: Optional pointer to a function set by the low-level driver to be + * called when @use_count changes from 0 to 1. + * @close: Optional pointer to a function set by the low-level driver to be + * called when @use_count changed from 1 to 0. + * @insn_device_config: Optional pointer to a handler for all sub-instructions + * except %INSN_DEVICE_CONFIG_GET_ROUTES of the %INSN_DEVICE_CONFIG + * instruction. If this is not initialized by the low-level driver, a + * default handler will be set during post-configuration. + * @get_valid_routes: Optional pointer to a handler for the + * %INSN_DEVICE_CONFIG_GET_ROUTES sub-instruction of the + * %INSN_DEVICE_CONFIG instruction set. If this is not initialized by the + * low-level driver, a default handler that copies zero routes back to the + * user will be used. + * + * This is the main control data structure for a COMEDI device (as far as the + * COMEDI core is concerned). 
There are two groups of COMEDI devices - + * "legacy" devices that are configured by the handler for the + * %COMEDI_DEVCONFIG ioctl, and automatically configured devices resulting + * from a call to comedi_auto_config() as a result of a bus driver probe in + * a low-level COMEDI driver. The "legacy" COMEDI devices are allocated + * during module initialization if the "comedi_num_legacy_minors" module + * parameter is non-zero and use minor device numbers from 0 to + * comedi_num_legacy_minors minus one. The automatically configured COMEDI + * devices are allocated on demand and use minor device numbers from + * comedi_num_legacy_minors to 47. + */ +struct comedi_device { + int use_count; + struct comedi_driver *driver; + struct comedi_8254 *pacer; + void *private; + + struct device *class_dev; + int minor; + unsigned int detach_count; + struct device *hw_dev; + + const char *board_name; + const void *board_ptr; + unsigned int attached:1; + unsigned int ioenabled:1; + spinlock_t spinlock; /* generic spin-lock for low-level driver */ + struct mutex mutex; /* generic mutex for COMEDI core */ + struct rw_semaphore attach_lock; + struct kref refcount; + + int n_subdevices; + struct comedi_subdevice *subdevices; + + /* dumb */ + void __iomem *mmio; + unsigned long iobase; + unsigned long iolen; + unsigned int irq; + + struct comedi_subdevice *read_subdev; + struct comedi_subdevice *write_subdev; + + struct fasync_struct *async_queue; + + int (*open)(struct comedi_device *dev); + void (*close)(struct comedi_device *dev); + int (*insn_device_config)(struct comedi_device *dev, + struct comedi_insn *insn, unsigned int *data); + unsigned int (*get_valid_routes)(struct comedi_device *dev, + unsigned int n_pairs, + unsigned int *pair_data); +}; + +/* + * function prototypes + */ + +void comedi_event(struct comedi_device *dev, struct comedi_subdevice *s); + +struct comedi_device *comedi_dev_get_from_minor(unsigned int minor); +int comedi_dev_put(struct comedi_device *dev); + 
+bool comedi_is_subdevice_running(struct comedi_subdevice *s); + +void *comedi_alloc_spriv(struct comedi_subdevice *s, size_t size); +void comedi_set_spriv_auto_free(struct comedi_subdevice *s); + +int comedi_check_chanlist(struct comedi_subdevice *s, + int n, + unsigned int *chanlist); + +/* range stuff */ + +#define RANGE(a, b) {(a) * 1e6, (b) * 1e6, 0} +#define RANGE_ext(a, b) {(a) * 1e6, (b) * 1e6, RF_EXTERNAL} +#define RANGE_mA(a, b) {(a) * 1e6, (b) * 1e6, UNIT_mA} +#define RANGE_unitless(a, b) {(a) * 1e6, (b) * 1e6, 0} +#define BIP_RANGE(a) {-(a) * 1e6, (a) * 1e6, 0} +#define UNI_RANGE(a) {0, (a) * 1e6, 0} + +extern const struct comedi_lrange range_bipolar10; +extern const struct comedi_lrange range_bipolar5; +extern const struct comedi_lrange range_bipolar2_5; +extern const struct comedi_lrange range_unipolar10; +extern const struct comedi_lrange range_unipolar5; +extern const struct comedi_lrange range_unipolar2_5; +extern const struct comedi_lrange range_0_20mA; +extern const struct comedi_lrange range_4_20mA; +extern const struct comedi_lrange range_0_32mA; +extern const struct comedi_lrange range_unknown; + +#define range_digital range_unipolar5 + +/** + * struct comedi_lrange - Describes a COMEDI range table + * @length: Number of entries in the range table. + * @range: Array of &struct comedi_krange, one for each range. + * + * Each element of @range[] describes the minimum and maximum physical range + * and the type of units. Typically, the type of unit is %UNIT_volt + * (i.e. volts) and the minimum and maximum are in millionths of a volt. + * There may also be a flag that indicates the minimum and maximum are merely + * scale factors for an unknown, external reference. + */ +struct comedi_lrange { + int length; + struct comedi_krange range[]; +}; + +/** + * comedi_range_is_bipolar() - Test if subdevice range is bipolar + * @s: COMEDI subdevice. + * @range: Index of range within a range table. 
+ * + * Tests whether a range is bipolar by checking whether its minimum value + * is negative. + * + * Assumes @range is valid. Does not work for subdevices using a + * channel-specific range table list. + * + * Return: + * %true if the range is bipolar. + * %false if the range is unipolar. + */ +static inline bool comedi_range_is_bipolar(struct comedi_subdevice *s, + unsigned int range) +{ + return s->range_table->range[range].min < 0; +} + +/** + * comedi_range_is_unipolar() - Test if subdevice range is unipolar + * @s: COMEDI subdevice. + * @range: Index of range within a range table. + * + * Tests whether a range is unipolar by checking whether its minimum value + * is at least 0. + * + * Assumes @range is valid. Does not work for subdevices using a + * channel-specific range table list. + * + * Return: + * %true if the range is unipolar. + * %false if the range is bipolar. + */ +static inline bool comedi_range_is_unipolar(struct comedi_subdevice *s, + unsigned int range) +{ + return s->range_table->range[range].min >= 0; +} + +/** + * comedi_range_is_external() - Test if subdevice range is external + * @s: COMEDI subdevice. + * @range: Index of range within a range table. + * + * Tests whether a range is externally referenced by checking whether its + * %RF_EXTERNAL flag is set. + * + * Assumes @range is valid. Does not work for subdevices using a + * channel-specific range table list. + * + * Return: + * %true if the range is external. + * %false if the range is internal. + */ +static inline bool comedi_range_is_external(struct comedi_subdevice *s, + unsigned int range) +{ + return !!(s->range_table->range[range].flags & RF_EXTERNAL); +} + +/** + * comedi_chan_range_is_bipolar() - Test if channel-specific range is bipolar + * @s: COMEDI subdevice. + * @chan: The channel number. + * @range: Index of range within a range table. + * + * Tests whether a range is bipolar by checking whether its minimum value + * is negative.
+ * + * Assumes @chan and @range are valid. Only works for subdevices with a + * channel-specific range table list. + * + * Return: + * %true if the range is bipolar. + * %false if the range is unipolar. + */ +static inline bool comedi_chan_range_is_bipolar(struct comedi_subdevice *s, + unsigned int chan, + unsigned int range) +{ + return s->range_table_list[chan]->range[range].min < 0; +} + +/** + * comedi_chan_range_is_unipolar() - Test if channel-specific range is unipolar + * @s: COMEDI subdevice. + * @chan: The channel number. + * @range: Index of range within a range table. + * + * Tests whether a range is unipolar by checking whether its minimum value + * is at least 0. + * + * Assumes @chan and @range are valid. Only works for subdevices with a + * channel-specific range table list. + * + * Return: + * %true if the range is unipolar. + * %false if the range is bipolar. + */ +static inline bool comedi_chan_range_is_unipolar(struct comedi_subdevice *s, + unsigned int chan, + unsigned int range) +{ + return s->range_table_list[chan]->range[range].min >= 0; +} + +/** + * comedi_chan_range_is_external() - Test if channel-specific range is external + * @s: COMEDI subdevice. + * @chan: The channel number. + * @range: Index of range within a range table. + * + * Tests whether a range is externally referenced by checking whether its + * %RF_EXTERNAL flag is set. + * + * Assumes @chan and @range are valid. Only works for subdevices with a + * channel-specific range table list. + * + * Return: + * %true if the range is external. + * %false if the range is internal. + */ +static inline bool comedi_chan_range_is_external(struct comedi_subdevice *s, + unsigned int chan, + unsigned int range) +{ + return !!(s->range_table_list[chan]->range[range].flags & RF_EXTERNAL); +} + +/** + * comedi_offset_munge() - Convert between offset binary and 2's complement + * @s: COMEDI subdevice. + * @val: Value to be converted.
+ * + * Toggles the highest bit of a sample value to toggle between offset binary + * and 2's complement. Assumes that @s->maxdata is a power of 2 minus 1. + * + * Return: The converted value. + */ +static inline unsigned int comedi_offset_munge(struct comedi_subdevice *s, + unsigned int val) +{ + return val ^ s->maxdata ^ (s->maxdata >> 1); +} + +/** + * comedi_bytes_per_sample() - Determine subdevice sample size + * @s: COMEDI subdevice. + * + * The sample size will be 4 (sizeof int) or 2 (sizeof short) depending on + * whether the %SDF_LSAMPL subdevice flag is set or not. + * + * Return: The subdevice sample size. + */ +static inline unsigned int comedi_bytes_per_sample(struct comedi_subdevice *s) +{ + return s->subdev_flags & SDF_LSAMPL ? sizeof(int) : sizeof(short); +} + +/** + * comedi_sample_shift() - Determine log2 of subdevice sample size + * @s: COMEDI subdevice. + * + * The sample size will be 4 (sizeof int) or 2 (sizeof short) depending on + * whether the %SDF_LSAMPL subdevice flag is set or not. The log2 of the + * sample size will be 2 or 1 and can be used as the right operand of a + * bit-shift operator to multiply or divide something by the sample size. + * + * Return: log2 of the subdevice sample size. + */ +static inline unsigned int comedi_sample_shift(struct comedi_subdevice *s) +{ + return s->subdev_flags & SDF_LSAMPL ? 2 : 1; +} + +/** + * comedi_bytes_to_samples() - Convert a number of bytes to a number of samples + * @s: COMEDI subdevice. + * @nbytes: Number of bytes + * + * Return: The number of bytes divided by the subdevice sample size. + */ +static inline unsigned int comedi_bytes_to_samples(struct comedi_subdevice *s, + unsigned int nbytes) +{ + return nbytes >> comedi_sample_shift(s); +} + +/** + * comedi_samples_to_bytes() - Convert a number of samples to a number of bytes + * @s: COMEDI subdevice. + * @nsamples: Number of samples. + * + * Return: The number of samples multiplied by the subdevice sample size. 
+ * (Does not check for arithmetic overflow.) + */ +static inline unsigned int comedi_samples_to_bytes(struct comedi_subdevice *s, + unsigned int nsamples) +{ + return nsamples << comedi_sample_shift(s); +} + +/** + * comedi_check_trigger_src() - Trivially validate a comedi_cmd trigger source + * @src: Pointer to the trigger source to validate. + * @flags: Bitmask of valid %TRIG_* for the trigger. + * + * This is used in "step 1" of the do_cmdtest functions of comedi drivers + * to validate the comedi_cmd triggers. The mask of the @src against the + * @flags allows the userspace comedilib to pass all the comedi_cmd + * triggers as %TRIG_ANY and get back a bitmask of the valid trigger sources. + * + * Return: + * 0 if trigger sources in *@src are all supported. + * -EINVAL if any trigger source in *@src is unsupported. + */ +static inline int comedi_check_trigger_src(unsigned int *src, + unsigned int flags) +{ + unsigned int orig_src = *src; + + *src = orig_src & flags; + if (*src == TRIG_INVALID || *src != orig_src) + return -EINVAL; + return 0; +} + +/** + * comedi_check_trigger_is_unique() - Make sure a trigger source is unique + * @src: The trigger source to check. + * + * Return: + * 0 if no more than one trigger source is set. + * -EINVAL if more than one trigger source is set. + */ +static inline int comedi_check_trigger_is_unique(unsigned int src) +{ + /* this test is true if more than one _src bit is set */ + if ((src & (src - 1)) != 0) + return -EINVAL; + return 0; +} + +/** + * comedi_check_trigger_arg_is() - Trivially validate a trigger argument + * @arg: Pointer to the trigger arg to validate. + * @val: The value the argument should be. + * + * Forces *@arg to be @val. + * + * Return: + * 0 if *@arg was already @val. + * -EINVAL if *@arg differed from @val. 
+ */ +static inline int comedi_check_trigger_arg_is(unsigned int *arg, + unsigned int val) +{ + if (*arg != val) { + *arg = val; + return -EINVAL; + } + return 0; +} + +/** + * comedi_check_trigger_arg_min() - Trivially validate a trigger argument min + * @arg: Pointer to the trigger arg to validate. + * @val: The minimum value the argument should be. + * + * Forces *@arg to be at least @val, setting it to @val if necessary. + * + * Return: + * 0 if *@arg was already at least @val. + * -EINVAL if *@arg was less than @val. + */ +static inline int comedi_check_trigger_arg_min(unsigned int *arg, + unsigned int val) +{ + if (*arg < val) { + *arg = val; + return -EINVAL; + } + return 0; +} + +/** + * comedi_check_trigger_arg_max() - Trivially validate a trigger argument max + * @arg: Pointer to the trigger arg to validate. + * @val: The maximum value the argument should be. + * + * Forces *@arg to be no more than @val, setting it to @val if necessary. + * + * Return: + * 0 if *@arg was already no more than @val. + * -EINVAL if *@arg was greater than @val. + */ +static inline int comedi_check_trigger_arg_max(unsigned int *arg, + unsigned int val) +{ + if (*arg > val) { + *arg = val; + return -EINVAL; + } + return 0; +} + +/* + * Must set dev->hw_dev if you wish to dma directly into comedi's buffer. + * Also useful for retrieving a previously configured hardware device of + * known bus type. Set automatically for auto-configured devices. + * Automatically set to NULL when detaching hardware device. + */ +int comedi_set_hw_dev(struct comedi_device *dev, struct device *hw_dev); + +/** + * comedi_buf_n_bytes_ready() - Determine amount of unread data in buffer + * @s: COMEDI subdevice. + * + * Determines the number of bytes of unread data in the asynchronous + * acquisition data buffer for a subdevice. The data in question might not + * have been fully "munged" yet. + * + * Return: The amount of unread data in bytes.
+ */ +static inline unsigned int comedi_buf_n_bytes_ready(struct comedi_subdevice *s) +{ + return s->async->buf_write_count - s->async->buf_read_count; +} + +unsigned int comedi_buf_write_alloc(struct comedi_subdevice *s, unsigned int n); +unsigned int comedi_buf_write_free(struct comedi_subdevice *s, unsigned int n); + +unsigned int comedi_buf_read_n_available(struct comedi_subdevice *s); +unsigned int comedi_buf_read_alloc(struct comedi_subdevice *s, unsigned int n); +unsigned int comedi_buf_read_free(struct comedi_subdevice *s, unsigned int n); + +unsigned int comedi_buf_write_samples(struct comedi_subdevice *s, + const void *data, unsigned int nsamples); +unsigned int comedi_buf_read_samples(struct comedi_subdevice *s, + void *data, unsigned int nsamples); + +/* drivers.c - general comedi driver functions */ + +#define COMEDI_TIMEOUT_MS 1000 + +int comedi_timeout(struct comedi_device *dev, struct comedi_subdevice *s, + struct comedi_insn *insn, + int (*cb)(struct comedi_device *dev, + struct comedi_subdevice *s, + struct comedi_insn *insn, unsigned long context), + unsigned long context); + +unsigned int comedi_handle_events(struct comedi_device *dev, + struct comedi_subdevice *s); + +int comedi_dio_insn_config(struct comedi_device *dev, + struct comedi_subdevice *s, + struct comedi_insn *insn, unsigned int *data, + unsigned int mask); +unsigned int comedi_dio_update_state(struct comedi_subdevice *s, + unsigned int *data); +unsigned int comedi_bytes_per_scan_cmd(struct comedi_subdevice *s, + struct comedi_cmd *cmd); +unsigned int comedi_bytes_per_scan(struct comedi_subdevice *s); +unsigned int comedi_nscans_left(struct comedi_subdevice *s, + unsigned int nscans); +unsigned int comedi_nsamples_left(struct comedi_subdevice *s, + unsigned int nsamples); +void comedi_inc_scan_progress(struct comedi_subdevice *s, + unsigned int num_bytes); + +void *comedi_alloc_devpriv(struct comedi_device *dev, size_t size); +int comedi_alloc_subdevices(struct comedi_device *dev, 
int num_subdevices); +int comedi_alloc_subdev_readback(struct comedi_subdevice *s); + +int comedi_readback_insn_read(struct comedi_device *dev, + struct comedi_subdevice *s, + struct comedi_insn *insn, unsigned int *data); + +int comedi_load_firmware(struct comedi_device *dev, struct device *hw_dev, + const char *name, + int (*cb)(struct comedi_device *dev, + const u8 *data, size_t size, + unsigned long context), + unsigned long context); + +int __comedi_request_region(struct comedi_device *dev, + unsigned long start, unsigned long len); +int comedi_request_region(struct comedi_device *dev, + unsigned long start, unsigned long len); +void comedi_legacy_detach(struct comedi_device *dev); + +int comedi_auto_config(struct device *hardware_device, + struct comedi_driver *driver, unsigned long context); +void comedi_auto_unconfig(struct device *hardware_device); + +int comedi_driver_register(struct comedi_driver *driver); +void comedi_driver_unregister(struct comedi_driver *driver); + +/** + * module_comedi_driver() - Helper macro for registering a comedi driver + * @__comedi_driver: comedi_driver struct + * + * Helper macro for comedi drivers which do not do anything special in module + * init/exit. This eliminates a lot of boilerplate. Each module may only use + * this macro once, and calling it replaces module_init() and module_exit(). + */ +#define module_comedi_driver(__comedi_driver) \ + module_driver(__comedi_driver, comedi_driver_register, \ + comedi_driver_unregister) + +#endif /* _COMEDIDEV_H */ diff --git a/include/linux/comedi/comedilib.h b/include/linux/comedi/comedilib.h new file mode 100644 index 000000000000..0223c9cd9215 --- /dev/null +++ b/include/linux/comedi/comedilib.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * comedilib.h + * Header file for kcomedilib + * + * COMEDI - Linux Control and Measurement Device Interface + * Copyright (C) 1998-2001 David A. 
Schleef + */ + +#ifndef _LINUX_COMEDILIB_H +#define _LINUX_COMEDILIB_H + +struct comedi_device *comedi_open(const char *path); +int comedi_close(struct comedi_device *dev); +int comedi_dio_get_config(struct comedi_device *dev, unsigned int subdev, + unsigned int chan, unsigned int *io); +int comedi_dio_config(struct comedi_device *dev, unsigned int subdev, + unsigned int chan, unsigned int io); +int comedi_dio_bitfield2(struct comedi_device *dev, unsigned int subdev, + unsigned int mask, unsigned int *bits, + unsigned int base_channel); +int comedi_find_subdevice_by_type(struct comedi_device *dev, int type, + unsigned int subd); +int comedi_get_n_channels(struct comedi_device *dev, unsigned int subdevice); + +#endif -- cgit v1.2.3 From 631e272b12075b60f7c7fc4f84f937d78a699844 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Wed, 17 Nov 2021 12:06:01 +0000 Subject: comedi: Move and rename "8255.h" to Some of the header files in "drivers/comedi/drivers/" are common enough to be useful to out-of-tree comedi driver modules. Using them for out-of-tree module builds is hampered by the headers being outside the "include/" directory so it is desirable to move them. There are about a couple of dozen Comedi device drivers that use the "comedi_8255" module to add digital I/O subdevices based on the venerable 8255 Programmable Peripheral Interface chip. The macros and declarations to use that module are in the "8255.h" header file in the comedi "drivers" directory. Move it into "include/linux/comedi/" and rename it to "comedi_8255.h" for naming consistency reasons. 
Signed-off-by: Ian Abbott Link: https://lore.kernel.org/r/20211117120604.117740-4-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/comedi/comedi_8255.h | 42 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 include/linux/comedi/comedi_8255.h (limited to 'include/linux') diff --git a/include/linux/comedi/comedi_8255.h b/include/linux/comedi/comedi_8255.h new file mode 100644 index 000000000000..b2a5bc6b3a49 --- /dev/null +++ b/include/linux/comedi/comedi_8255.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * comedi_8255.h + * Generic 8255 digital I/O subdevice support + * + * COMEDI - Linux Control and Measurement Device Interface + * Copyright (C) 1998 David A. Schleef + */ + +#ifndef _COMEDI_8255_H +#define _COMEDI_8255_H + +#define I8255_SIZE 0x04 + +#define I8255_DATA_A_REG 0x00 +#define I8255_DATA_B_REG 0x01 +#define I8255_DATA_C_REG 0x02 +#define I8255_CTRL_REG 0x03 +#define I8255_CTRL_C_LO_IO BIT(0) +#define I8255_CTRL_B_IO BIT(1) +#define I8255_CTRL_B_MODE BIT(2) +#define I8255_CTRL_C_HI_IO BIT(3) +#define I8255_CTRL_A_IO BIT(4) +#define I8255_CTRL_A_MODE(x) ((x) << 5) +#define I8255_CTRL_CW BIT(7) + +struct comedi_device; +struct comedi_subdevice; + +int subdev_8255_init(struct comedi_device *dev, struct comedi_subdevice *s, + int (*io)(struct comedi_device *dev, int dir, int port, + int data, unsigned long regbase), + unsigned long regbase); + +int subdev_8255_mm_init(struct comedi_device *dev, struct comedi_subdevice *s, + int (*io)(struct comedi_device *dev, int dir, int port, + int data, unsigned long regbase), + unsigned long regbase); + +unsigned long subdev_8255_regbase(struct comedi_subdevice *s); + +#endif -- cgit v1.2.3 From 44fb7affcfa4e968e9c2ede023ef0e15f06d8209 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Wed, 17 Nov 2021 12:06:02 +0000 Subject: comedi: Move "comedi_8254.h" to Some of the header files in "drivers/comedi/drivers/" are common enough to be useful 
to out-of-tree comedi driver modules. Using them for out-of-tree module builds is hampered by the headers being outside the "include/" directory so it is desirable to move them. There are about a couple of dozen or so Comedi device drivers that use the "comedi_8254" module to add timers based on the venerable 8254 Programmable Interval Timer chip. The macros and declarations to use that module are in the "comedi_8254.h" header file in the comedi "drivers" directory. Move it into "include/linux/comedi/". Signed-off-by: Ian Abbott Link: https://lore.kernel.org/r/20211117120604.117740-5-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/comedi/comedi_8254.h | 134 +++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 include/linux/comedi/comedi_8254.h (limited to 'include/linux') diff --git a/include/linux/comedi/comedi_8254.h b/include/linux/comedi/comedi_8254.h new file mode 100644 index 000000000000..d8264417e53c --- /dev/null +++ b/include/linux/comedi/comedi_8254.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * comedi_8254.h + * Generic 8254 timer/counter support + * Copyright (C) 2014 H Hartley Sweeten + * + * COMEDI - Linux Control and Measurement Device Interface + * Copyright (C) 2000 David A. 
Schleef + */ + +#ifndef _COMEDI_8254_H +#define _COMEDI_8254_H + +#include + +struct comedi_device; +struct comedi_insn; +struct comedi_subdevice; + +/* + * Common oscillator base values in nanoseconds + */ +#define I8254_OSC_BASE_10MHZ 100 +#define I8254_OSC_BASE_5MHZ 200 +#define I8254_OSC_BASE_4MHZ 250 +#define I8254_OSC_BASE_2MHZ 500 +#define I8254_OSC_BASE_1MHZ 1000 +#define I8254_OSC_BASE_100KHZ 10000 +#define I8254_OSC_BASE_10KHZ 100000 +#define I8254_OSC_BASE_1KHZ 1000000 + +/* + * I/O access size used to read/write registers + */ +#define I8254_IO8 1 +#define I8254_IO16 2 +#define I8254_IO32 4 + +/* + * Register map for generic 8254 timer (I8254_IO8 with 0 regshift) + */ +#define I8254_COUNTER0_REG 0x00 +#define I8254_COUNTER1_REG 0x01 +#define I8254_COUNTER2_REG 0x02 +#define I8254_CTRL_REG 0x03 +#define I8254_CTRL_SEL_CTR(x) ((x) << 6) +#define I8254_CTRL_READBACK(x) (I8254_CTRL_SEL_CTR(3) | BIT(x)) +#define I8254_CTRL_READBACK_COUNT I8254_CTRL_READBACK(4) +#define I8254_CTRL_READBACK_STATUS I8254_CTRL_READBACK(5) +#define I8254_CTRL_READBACK_SEL_CTR(x) (2 << (x)) +#define I8254_CTRL_RW(x) (((x) & 0x3) << 4) +#define I8254_CTRL_LATCH I8254_CTRL_RW(0) +#define I8254_CTRL_LSB_ONLY I8254_CTRL_RW(1) +#define I8254_CTRL_MSB_ONLY I8254_CTRL_RW(2) +#define I8254_CTRL_LSB_MSB I8254_CTRL_RW(3) + +/* counter maps zero to 0x10000 */ +#define I8254_MAX_COUNT 0x10000 + +/** + * struct comedi_8254 - private data used by this module + * @iobase: PIO base address of the registers (in/out) + * @mmio: MMIO base address of the registers (read/write) + * @iosize: I/O size used to access the registers (b/w/l) + * @regshift: register gap shift + * @osc_base: cascaded oscillator speed in ns + * @divisor: divisor for single counter + * @divisor1: divisor loaded into first cascaded counter + * @divisor2: divisor loaded into second cascaded counter + * @next_div: next divisor for single counter + * @next_div1: next divisor to use for first cascaded counter + * @next_div2: next
divisor to use for second cascaded counter + * @clock_src: current clock source for each counter (driver specific) + * @gate_src: current gate source for each counter (driver specific) + * @busy: flags used to indicate that a counter is "busy" + * @insn_config: driver specific (*insn_config) callback + */ +struct comedi_8254 { + unsigned long iobase; + void __iomem *mmio; + unsigned int iosize; + unsigned int regshift; + unsigned int osc_base; + unsigned int divisor; + unsigned int divisor1; + unsigned int divisor2; + unsigned int next_div; + unsigned int next_div1; + unsigned int next_div2; + unsigned int clock_src[3]; + unsigned int gate_src[3]; + bool busy[3]; + + int (*insn_config)(struct comedi_device *dev, + struct comedi_subdevice *s, + struct comedi_insn *insn, unsigned int *data); +}; + +unsigned int comedi_8254_status(struct comedi_8254 *i8254, + unsigned int counter); +unsigned int comedi_8254_read(struct comedi_8254 *i8254, unsigned int counter); +void comedi_8254_write(struct comedi_8254 *i8254, + unsigned int counter, unsigned int val); + +int comedi_8254_set_mode(struct comedi_8254 *i8254, + unsigned int counter, unsigned int mode); +int comedi_8254_load(struct comedi_8254 *i8254, + unsigned int counter, unsigned int val, unsigned int mode); + +void comedi_8254_pacer_enable(struct comedi_8254 *i8254, + unsigned int counter1, unsigned int counter2, + bool enable); +void comedi_8254_update_divisors(struct comedi_8254 *i8254); +void comedi_8254_cascade_ns_to_timer(struct comedi_8254 *i8254, + unsigned int *nanosec, unsigned int flags); +void comedi_8254_ns_to_timer(struct comedi_8254 *i8254, + unsigned int *nanosec, unsigned int flags); + +void comedi_8254_set_busy(struct comedi_8254 *i8254, + unsigned int counter, bool busy); + +void comedi_8254_subdevice_init(struct comedi_subdevice *s, + struct comedi_8254 *i8254); + +struct comedi_8254 *comedi_8254_init(unsigned long iobase, + unsigned int osc_base, + unsigned int iosize, + unsigned int regshift); 
+struct comedi_8254 *comedi_8254_mm_init(void __iomem *mmio, + unsigned int osc_base, + unsigned int iosize, + unsigned int regshift); + +#endif /* _COMEDI_8254_H */ -- cgit v1.2.3 From fe7a4f5b9548456246ffda143bab59922acda9fd Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Wed, 17 Nov 2021 12:06:03 +0000 Subject: comedi: Move "comedi_isadma.h" to <linux/comedi/comedi_isadma.h> Some of the header files in "drivers/comedi/drivers/" are common enough to be useful to out-of-tree comedi driver modules. Using them for out-of-tree module builds is hampered by the headers being outside the "include/" directory so it is desirable to move them. There are about a half a dozen or so Comedi device drivers that use the "comedi_isadma" module to add ISA DMA support. The macros and declarations to use that module are in the "comedi_isadma.h" header file in the comedi "drivers" directory. Move it into "include/linux/comedi/". Signed-off-by: Ian Abbott Link: https://lore.kernel.org/r/20211117120604.117740-6-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/comedi/comedi_isadma.h | 114 +++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 include/linux/comedi/comedi_isadma.h (limited to 'include/linux') diff --git a/include/linux/comedi/comedi_isadma.h b/include/linux/comedi/comedi_isadma.h new file mode 100644 index 000000000000..9d2b12db7e6e --- /dev/null +++ b/include/linux/comedi/comedi_isadma.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * COMEDI ISA DMA support functions + * Copyright (c) 2014 H Hartley Sweeten + */ + +#ifndef _COMEDI_ISADMA_H +#define _COMEDI_ISADMA_H + +#include + +struct comedi_device; +struct device; + +/* + * These are used to avoid issues when <asm/dma.h> and the DMA_MODE_ + * defines are not available. 
+ */ +#define COMEDI_ISADMA_READ 0 +#define COMEDI_ISADMA_WRITE 1 + +/** + * struct comedi_isadma_desc - cookie for ISA DMA + * @virt_addr: virtual address of buffer + * @hw_addr: hardware (bus) address of buffer + * @chan: DMA channel + * @maxsize: allocated size of buffer (in bytes) + * @size: transfer size (in bytes) + * @mode: DMA_MODE_READ or DMA_MODE_WRITE + */ +struct comedi_isadma_desc { + void *virt_addr; + dma_addr_t hw_addr; + unsigned int chan; + unsigned int maxsize; + unsigned int size; + char mode; +}; + +/** + * struct comedi_isadma - ISA DMA data + * @dev: device to allocate non-coherent memory for + * @desc: cookie for each DMA buffer + * @n_desc: the number of cookies + * @cur_dma: the current cookie in use + * @chan: the first DMA channel requested + * @chan2: the second DMA channel requested + */ +struct comedi_isadma { + struct device *dev; + struct comedi_isadma_desc *desc; + int n_desc; + int cur_dma; + unsigned int chan; + unsigned int chan2; +}; + +#if IS_ENABLED(CONFIG_ISA_DMA_API) + +void comedi_isadma_program(struct comedi_isadma_desc *desc); +unsigned int comedi_isadma_disable(unsigned int dma_chan); +unsigned int comedi_isadma_disable_on_sample(unsigned int dma_chan, + unsigned int size); +unsigned int comedi_isadma_poll(struct comedi_isadma *dma); +void comedi_isadma_set_mode(struct comedi_isadma_desc *desc, char dma_dir); + +struct comedi_isadma *comedi_isadma_alloc(struct comedi_device *dev, + int n_desc, unsigned int dma_chan1, + unsigned int dma_chan2, + unsigned int maxsize, char dma_dir); +void comedi_isadma_free(struct comedi_isadma *dma); + +#else /* !IS_ENABLED(CONFIG_ISA_DMA_API) */ + +static inline void comedi_isadma_program(struct comedi_isadma_desc *desc) +{ +} + +static inline unsigned int comedi_isadma_disable(unsigned int dma_chan) +{ + return 0; +} + +static inline unsigned int +comedi_isadma_disable_on_sample(unsigned int dma_chan, unsigned int size) +{ + return 0; +} + +static inline unsigned int 
comedi_isadma_poll(struct comedi_isadma *dma) +{ + return 0; +} + +static inline void comedi_isadma_set_mode(struct comedi_isadma_desc *desc, + char dma_dir) +{ +} + +static inline struct comedi_isadma * +comedi_isadma_alloc(struct comedi_device *dev, int n_desc, + unsigned int dma_chan1, unsigned int dma_chan2, + unsigned int maxsize, char dma_dir) +{ + return NULL; +} + +static inline void comedi_isadma_free(struct comedi_isadma *dma) +{ +} + +#endif /* !IS_ENABLED(CONFIG_ISA_DMA_API) */ + +#endif /* #ifndef _COMEDI_ISADMA_H */ -- cgit v1.2.3 From 2cca3465147d650be3de04927a99784b30251ade Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Fri, 12 Nov 2021 08:28:09 +0200 Subject: mei: bus: add client dma interface Expose the client dma mapping via mei client bus interface. The client dma has to be mapped before the device is enabled, therefore we need to create device linking already during mapping, and we need to unmap after the client is disabled, hence we need to postpone the unlink and flush till unmapping or when destroying the device. 
Signed-off-by: Alexander Usyskin Co-developed-by: Tomas Winkler Signed-off-by: Tomas Winkler Signed-off-by: Emmanuel Grumbach Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20210420172755.12178-1-emmanuel.grumbach@intel.com Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211112062814.7502-1-emmanuel.grumbach@intel.com --- include/linux/mei_cl_bus.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index c6786c12b207..df1fab44ea5c 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -117,4 +117,7 @@ int mei_cldev_enable(struct mei_cl_device *cldev); int mei_cldev_disable(struct mei_cl_device *cldev); bool mei_cldev_enabled(const struct mei_cl_device *cldev); +void *mei_cldev_dma_map(struct mei_cl_device *cldev, u8 buffer_id, size_t size); +int mei_cldev_dma_unmap(struct mei_cl_device *cldev); + #endif /* _LINUX_MEI_CL_BUS_H */ -- cgit v1.2.3 From ec15baec3272bbec576f2ce7ce47765a8e9b7b1c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 26 Nov 2021 19:28:43 +0200 Subject: net: ptp: add a definition for the UDP port for IEEE 1588 general messages As opposed to event messages (Sync, PdelayReq etc) which require timestamping, general messages (Announce, FollowUp etc) do not. In PTP they are part of different streams of data. IEEE 1588-2008 Annex D.2 "UDP port numbers" states that the UDP destination port assigned by IANA is 319 for event messages, and 320 for general messages. Yet the kernel seems to be missing the definition for general messages. This patch adds it. 
Signed-off-by: Vladimir Oltean Acked-by: Richard Cochran Signed-off-by: Jakub Kicinski --- include/linux/ptp_classify.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index ae04968a3a47..9afd34a2d36c 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -37,6 +37,7 @@ #define PTP_MSGTYPE_PDELAY_RESP 0x3 #define PTP_EV_PORT 319 +#define PTP_GEN_PORT 320 #define PTP_GEN_BIT 0x08 /* indicates general message, if set in message type */ #define OFF_PTP_SOURCE_UUID 22 /* PTPv1 only */ -- cgit v1.2.3 From a6914afcdf0e3fb853fce0e0c04710be7427b62f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 10 Nov 2021 12:31:28 +0200 Subject: kobject: Replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot to dependency hell, especially when circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211110103128.59888-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index efd56f990a46..c740062b4b1a 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -19,10 +19,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include -- cgit v1.2.3 From 6a2d2ddf2c345e0149bfbffdddc4768a9ab0a741 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 12 Nov 2021 14:32:27 +0100 Subject: drm: Move nomodeset kernel parameter to the DRM subsystem The "nomodeset" kernel cmdline parameter is handled by the vgacon driver but the exported vgacon_text_force() symbol is only used by DRM drivers. 
It makes much more sense for the parameter logic to be in the subsystem of the drivers that are making use of it. Let's move the vgacon_text_force() function and related logic to the DRM subsystem. While doing that, rename it to drm_firmware_drivers_only() and make it return true if "nomodeset" was used and false otherwise. This is a better description of the condition that the drivers are testing for. Suggested-by: Daniel Vetter Signed-off-by: Javier Martinez Canillas Acked-by: Thomas Zimmermann Acked-by: Jani Nikula Acked-by: Pekka Paalanen Acked-by: Greg Kroah-Hartman Link: https://patchwork.freedesktop.org/patch/msgid/20211112133230.1595307-4-javierm@redhat.com --- include/linux/console.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index a97f277cfdfa..7cd758a4f44e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -219,12 +219,6 @@ extern atomic_t ignore_console_lock_warning; #define VESA_HSYNC_SUSPEND 2 #define VESA_POWERDOWN 3 -#ifdef CONFIG_VGA_CONSOLE -extern bool vgacon_text_force(void); -#else -static inline bool vgacon_text_force(void) { return false; } -#endif - extern void console_init(void); /* For deferred console takeover */ -- cgit v1.2.3 From ed14e769f64311769dcf20dde544b82c158d01b1 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Mon, 15 Nov 2021 14:19:12 +0000 Subject: iio: buffer-dma: Remove unused iio_buffer_block struct This structure was never used anywhere, so it can safely be dropped. It will later be re-introduced as a different structure in a different header. 
Signed-off-by: Paul Cercueil Reviewed-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20211115141925.60164-3-paul@crapouillou.net Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dma.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h index ff15c61bf319..6564bdcdac66 100644 --- a/include/linux/iio/buffer-dma.h +++ b/include/linux/iio/buffer-dma.h @@ -17,11 +17,6 @@ struct iio_dma_buffer_queue; struct iio_dma_buffer_ops; struct device; -struct iio_buffer_block { - u32 size; - u32 bytes_used; -}; - /** * enum iio_block_state - State of a struct iio_dma_buffer_block * @IIO_BLOCK_STATE_DEQUEUED: Block is not queued -- cgit v1.2.3 From ffc7c5172a6d1f7ec468066a7172ce65baf1e3e1 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Fri, 19 Nov 2021 10:56:27 +0200 Subject: iio: expose shared parameter in IIO_ENUM_AVAILABLE The shared parameter should be configurable based on its usage, and not constrained to IIO_SHARED_BY_TYPE. This patch aims to improve the flexibility in using the IIO_ENUM_AVAILABLE define and avoid redefining custom iio enums that expose the shared parameter. An example is the ad5766.c driver where IIO_ENUM_AVAILABLE_SHARED was defined in order to achieve `shared` parameter customization. The current state of the IIO_ENUM_AVAILABLE implementation will imply similar redefinitions each time a driver will require access to the `shared` parameter. An example would be admv1013 driver which will require custom device attribute for the frequency translation modes: Quadrature I/Q mode and Intermediate Frequency mode. 
Signed-off-by: Antoniu Miclaus Reviewed-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20211119085627.6348-1-antoniu.miclaus@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 324561b7a5e8..07025d6b3de1 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -103,15 +103,16 @@ ssize_t iio_enum_write(struct iio_dev *indio_dev, /** * IIO_ENUM_AVAILABLE() - Initialize enum available extended channel attribute * @_name: Attribute name ("_available" will be appended to the name) + * @_shared: Whether the attribute is shared between all channels * @_e: Pointer to an iio_enum struct * * Creates a read only attribute which lists all the available enum items in a * space separated list. This should usually be used together with IIO_ENUM() */ -#define IIO_ENUM_AVAILABLE(_name, _e) \ +#define IIO_ENUM_AVAILABLE(_name, _shared, _e) \ { \ .name = (_name "_available"), \ - .shared = IIO_SHARED_BY_TYPE, \ + .shared = _shared, \ .read = iio_enum_available_read, \ .private = (uintptr_t)(_e), \ } -- cgit v1.2.3 From 75c5bd68b699bbcb6d25879644d62de4da14ab92 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Fri, 26 Nov 2021 10:48:19 +0100 Subject: ieee80211: change HE nominal packet padding value defines It's easier to use and understand, and to extend for EHT later, if we use the values here instead of the shifted values. Unfortunately, we need to add _POS so that we can use it in places like iwlwifi/mvm where constants are needed. While at it, fix the typo ("NOMIMAL") which also helps catch any conflicts. 
Signed-off-by: Miri Korenblit Link: https://lore.kernel.org/r/20211126104817.7c29a05b8eb5.I2ca9faf06e177e3035bec91e2ae53c2f91d41774@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 11d7af260f20..559b6c644938 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2258,11 +2258,12 @@ enum ieee80211_client_reg_power { #define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU 0x08 #define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB 0x10 #define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB 0x20 -#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_0US 0x00 -#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_8US 0x40 -#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_16US 0x80 -#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_RESERVED 0xc0 -#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_MASK 0xc0 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US 0x0 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US 0x1 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US 0x2 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED 0x3 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_POS 6 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK 0xc0 #define IEEE80211_HE_PHY_CAP10_HE_MU_M1RU_MAX_LTF 0x01 -- cgit v1.2.3 From 4ba0b2c294fe691921271372f7b59e5cc2ce4b0f Mon Sep 17 00:00:00 2001 From: Russ Weight Date: Thu, 18 Nov 2021 17:55:51 -0800 Subject: fpga: mgr: Use standard dev_release for class driver The FPGA manager class driver data structure is being treated as a managed resource instead of using the standard dev_release call-back function to release the class data structure. 
This change removes the managed resource code for the freeing of the class data structure and combines the create() and register() functions into a single register() or register_full() function. The register_full() function accepts an info data structure to provide flexibility in passing optional parameters. The register() function supports the current parameter list for users that don't require the use of optional parameters. The devm_fpga_mgr_register() function is retained, and the devm_fpga_mgr_register_full() function is added. Signed-off-by: Russ Weight Reviewed-by: Xu Yilun Acked-by: Xu Yilun Signed-off-by: Moritz Fischer --- include/linux/fpga/fpga-mgr.h | 62 ++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h index 474c1f506307..0f9468771bb9 100644 --- a/include/linux/fpga/fpga-mgr.h +++ b/include/linux/fpga/fpga-mgr.h @@ -105,6 +105,36 @@ struct fpga_image_info { #endif }; +/** + * struct fpga_compat_id - id for compatibility check + * + * @id_h: high 64bit of the compat_id + * @id_l: low 64bit of the compat_id + */ +struct fpga_compat_id { + u64 id_h; + u64 id_l; +}; + +/** + * struct fpga_manager_info - collection of parameters for an FPGA Manager + * @name: fpga manager name + * @compat_id: FPGA manager id for compatibility check. + * @mops: pointer to structure of fpga manager ops + * @priv: fpga manager private data + * + * fpga_manager_info contains parameters for the register_full function. + * These are separated into an info structure because some are optional and + * others could be added in the future. The info structure facilitates + * maintaining a stable API. 
+ */ +struct fpga_manager_info { + const char *name; + struct fpga_compat_id *compat_id; + const struct fpga_manager_ops *mops; + void *priv; +}; + /** * struct fpga_manager_ops - ops for low level fpga manager drivers * @initial_header_size: Maximum number of bytes that should be passed into write_init @@ -143,17 +173,6 @@ struct fpga_manager_ops { #define FPGA_MGR_STATUS_IP_PROTOCOL_ERR BIT(3) #define FPGA_MGR_STATUS_FIFO_OVERFLOW_ERR BIT(4) -/** - * struct fpga_compat_id - id for compatibility check - * - * @id_h: high 64bit of the compat_id - * @id_l: low 64bit of the compat_id - */ -struct fpga_compat_id { - u64 id_h; - u64 id_l; -}; - /** * struct fpga_manager - fpga manager structure * @name: name of low level fpga manager @@ -191,17 +210,18 @@ struct fpga_manager *fpga_mgr_get(struct device *dev); void fpga_mgr_put(struct fpga_manager *mgr); -struct fpga_manager *fpga_mgr_create(struct device *dev, const char *name, - const struct fpga_manager_ops *mops, - void *priv); -void fpga_mgr_free(struct fpga_manager *mgr); -int fpga_mgr_register(struct fpga_manager *mgr); -void fpga_mgr_unregister(struct fpga_manager *mgr); +struct fpga_manager * +fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info); -int devm_fpga_mgr_register(struct device *dev, struct fpga_manager *mgr); +struct fpga_manager * +fpga_mgr_register(struct device *parent, const char *name, + const struct fpga_manager_ops *mops, void *priv); +void fpga_mgr_unregister(struct fpga_manager *mgr); -struct fpga_manager *devm_fpga_mgr_create(struct device *dev, const char *name, - const struct fpga_manager_ops *mops, - void *priv); +struct fpga_manager * +devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info); +struct fpga_manager * +devm_fpga_mgr_register(struct device *parent, const char *name, + const struct fpga_manager_ops *mops, void *priv); #endif /*_LINUX_FPGA_MGR_H */ -- cgit v1.2.3 From 0d70af3c2530a70f1b2c197feaa63fbd3548ce34 Mon 
Sep 17 00:00:00 2001 From: Russ Weight Date: Thu, 18 Nov 2021 17:55:52 -0800 Subject: fpga: bridge: Use standard dev_release for class driver The FPGA bridge class driver data structure is being treated as a managed resource instead of using the standard dev_release call-back function to release the class data structure. This change removes the managed resource code and combines the create() and register() functions into a single register() function. Signed-off-by: Russ Weight Reviewed-by: Xu Yilun Acked-by: Xu Yilun Signed-off-by: Moritz Fischer --- include/linux/fpga/fpga-bridge.h | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fpga/fpga-bridge.h b/include/linux/fpga/fpga-bridge.h index 6c3c28806ff1..223da48a6d18 100644 --- a/include/linux/fpga/fpga-bridge.h +++ b/include/linux/fpga/fpga-bridge.h @@ -22,6 +22,23 @@ struct fpga_bridge_ops { const struct attribute_group **groups; }; +/** + * struct fpga_bridge_info - collection of parameters for an FPGA Bridge + * @name: fpga bridge name + * @br_ops: pointer to structure of fpga bridge ops + * @priv: fpga bridge private data + * + * fpga_bridge_info contains parameters for the register function. These + * are separated into an info structure because some are optional and + * others could be added in the future. The info structure facilitates + * maintaining a stable API. 
+ */ +struct fpga_bridge_info { + const char *name; + const struct fpga_bridge_ops *br_ops; + void *priv; +}; + /** * struct fpga_bridge - FPGA bridge structure * @name: name of low level FPGA bridge @@ -62,15 +79,10 @@ int of_fpga_bridge_get_to_list(struct device_node *np, struct fpga_image_info *info, struct list_head *bridge_list); -struct fpga_bridge *fpga_bridge_create(struct device *dev, const char *name, - const struct fpga_bridge_ops *br_ops, - void *priv); -void fpga_bridge_free(struct fpga_bridge *br); -int fpga_bridge_register(struct fpga_bridge *br); +struct fpga_bridge * +fpga_bridge_register(struct device *parent, const char *name, + const struct fpga_bridge_ops *br_ops, + void *priv); void fpga_bridge_unregister(struct fpga_bridge *br); -struct fpga_bridge -*devm_fpga_bridge_create(struct device *dev, const char *name, - const struct fpga_bridge_ops *br_ops, void *priv); - #endif /* _LINUX_FPGA_BRIDGE_H */ -- cgit v1.2.3 From 8886a579744fbfa53e69aa453ed10ae3b1f9abac Mon Sep 17 00:00:00 2001 From: Russ Weight Date: Thu, 18 Nov 2021 17:55:53 -0800 Subject: fpga: region: Use standard dev_release for class driver The FPGA region class driver data structure is being treated as a managed resource instead of using the standard dev_release call-back function to release the class data structure. This change removes the managed resource code and combines the create() and register() functions into a single register() or register_full() function. The register_full() function accepts an info data structure to provide flexibility in passing optional parameters. The register() function supports the current parameter list for users that don't require the use of optional parameters. 
Signed-off-by: Russ Weight Reviewed-by: Xu Yilun Acked-by: Xu Yilun Signed-off-by: Moritz Fischer --- include/linux/fpga/fpga-region.h | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fpga/fpga-region.h b/include/linux/fpga/fpga-region.h index 27cb706275db..3b87f232425c 100644 --- a/include/linux/fpga/fpga-region.h +++ b/include/linux/fpga/fpga-region.h @@ -7,6 +7,27 @@ #include #include +struct fpga_region; + +/** + * struct fpga_region_info - collection of parameters for an FPGA Region + * @mgr: fpga region manager + * @compat_id: FPGA region id for compatibility check. + * @priv: fpga region private data + * @get_bridges: optional function to get bridges to a list + * + * fpga_region_info contains parameters for the register_full function. + * These are separated into an info structure because some are optional and + * others could be added in the future. The info structure facilitates + * maintaining a stable API.
+ */ +struct fpga_region_info { + struct fpga_manager *mgr; + struct fpga_compat_id *compat_id; + void *priv; + int (*get_bridges)(struct fpga_region *region); +}; + /** * struct fpga_region - FPGA Region structure * @dev: FPGA Region device @@ -37,15 +58,12 @@ struct fpga_region *fpga_region_class_find( int fpga_region_program_fpga(struct fpga_region *region); -struct fpga_region -*fpga_region_create(struct device *dev, struct fpga_manager *mgr, - int (*get_bridges)(struct fpga_region *)); -void fpga_region_free(struct fpga_region *region); -int fpga_region_register(struct fpga_region *region); -void fpga_region_unregister(struct fpga_region *region); +struct fpga_region * +fpga_region_register_full(struct device *parent, const struct fpga_region_info *info); -struct fpga_region -*devm_fpga_region_create(struct device *dev, struct fpga_manager *mgr, - int (*get_bridges)(struct fpga_region *)); +struct fpga_region * +fpga_region_register(struct device *parent, struct fpga_manager *mgr, + int (*get_bridges)(struct fpga_region *)); +void fpga_region_unregister(struct fpga_region *region); #endif /* _FPGA_REGION_H */ -- cgit v1.2.3 From 306456c21c792ac633e660bc45f0854b612a0e98 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Tue, 16 Nov 2021 14:54:35 +0200 Subject: mfd: bd70528: Drop BD70528 support The only known BD70528 use-cases are such that the PMIC is controlled from separate MCU which is not running Linux. I am not aware of any Linux driver users. Furthermore, it seems there is no demand for this IC. Let's ease the maintenance burden and drop the driver. We can always add it back if there is sudden need for it. 
Signed-off-by: Matti Vaittinen Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/cf7dfd98b3403ad363b2b48b57bdbfd57a6416cb.1637066805.git.matti.vaittinen@fi.rohmeurope.com --- include/linux/mfd/rohm-bd70528.h | 389 --------------------------------------- include/linux/mfd/rohm-generic.h | 1 - 2 files changed, 390 deletions(-) delete mode 100644 include/linux/mfd/rohm-bd70528.h (limited to 'include/linux') diff --git a/include/linux/mfd/rohm-bd70528.h b/include/linux/mfd/rohm-bd70528.h deleted file mode 100644 index 4a5966475a35..000000000000 --- a/include/linux/mfd/rohm-bd70528.h +++ /dev/null @@ -1,389 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* Copyright (C) 2018 ROHM Semiconductors */ - -#ifndef __LINUX_MFD_BD70528_H__ -#define __LINUX_MFD_BD70528_H__ - -#include -#include -#include -#include -#include - -enum { - BD70528_BUCK1, - BD70528_BUCK2, - BD70528_BUCK3, - BD70528_LDO1, - BD70528_LDO2, - BD70528_LDO3, - BD70528_LED1, - BD70528_LED2, -}; - -struct bd70528_data { - struct rohm_regmap_dev chip; - struct mutex rtc_timer_lock; -}; - -#define BD70528_BUCK_VOLTS 0x10 -#define BD70528_LDO_VOLTS 0x20 - -#define BD70528_REG_BUCK1_EN 0x0F -#define BD70528_REG_BUCK1_VOLT 0x15 -#define BD70528_REG_BUCK2_EN 0x10 -#define BD70528_REG_BUCK2_VOLT 0x16 -#define BD70528_REG_BUCK3_EN 0x11 -#define BD70528_REG_BUCK3_VOLT 0x17 -#define BD70528_REG_LDO1_EN 0x1b -#define BD70528_REG_LDO1_VOLT 0x1e -#define BD70528_REG_LDO2_EN 0x1c -#define BD70528_REG_LDO2_VOLT 0x1f -#define BD70528_REG_LDO3_EN 0x1d -#define BD70528_REG_LDO3_VOLT 0x20 -#define BD70528_REG_LED_CTRL 0x2b -#define BD70528_REG_LED_VOLT 0x29 -#define BD70528_REG_LED_EN 0x2a - -/* main irq registers */ -#define BD70528_REG_INT_MAIN 0x7E -#define BD70528_REG_INT_MAIN_MASK 0x74 - -/* 'sub irq' registers */ -#define BD70528_REG_INT_SHDN 0x7F -#define BD70528_REG_INT_PWR_FLT 0x80 -#define BD70528_REG_INT_VR_FLT 0x81 -#define BD70528_REG_INT_MISC 0x82 -#define BD70528_REG_INT_BAT1 0x83 -#define 
BD70528_REG_INT_BAT2 0x84 -#define BD70528_REG_INT_RTC 0x85 -#define BD70528_REG_INT_GPIO 0x86 -#define BD70528_REG_INT_OP_FAIL 0x87 - -#define BD70528_REG_INT_SHDN_MASK 0x75 -#define BD70528_REG_INT_PWR_FLT_MASK 0x76 -#define BD70528_REG_INT_VR_FLT_MASK 0x77 -#define BD70528_REG_INT_MISC_MASK 0x78 -#define BD70528_REG_INT_BAT1_MASK 0x79 -#define BD70528_REG_INT_BAT2_MASK 0x7a -#define BD70528_REG_INT_RTC_MASK 0x7b -#define BD70528_REG_INT_GPIO_MASK 0x7c -#define BD70528_REG_INT_OP_FAIL_MASK 0x7d - -/* Reset related 'magic' registers */ -#define BD70528_REG_SHIPMODE 0x03 -#define BD70528_REG_HWRESET 0x04 -#define BD70528_REG_WARMRESET 0x05 -#define BD70528_REG_STANDBY 0x06 - -/* GPIO registers */ -#define BD70528_REG_GPIO_STATE 0x8F - -#define BD70528_REG_GPIO1_IN 0x4d -#define BD70528_REG_GPIO2_IN 0x4f -#define BD70528_REG_GPIO3_IN 0x51 -#define BD70528_REG_GPIO4_IN 0x53 -#define BD70528_REG_GPIO1_OUT 0x4e -#define BD70528_REG_GPIO2_OUT 0x50 -#define BD70528_REG_GPIO3_OUT 0x52 -#define BD70528_REG_GPIO4_OUT 0x54 - -/* RTC */ - -#define BD70528_REG_RTC_COUNT_H 0x2d -#define BD70528_REG_RTC_COUNT_L 0x2e -#define BD70528_REG_RTC_SEC 0x2f -#define BD70528_REG_RTC_MINUTE 0x30 -#define BD70528_REG_RTC_HOUR 0x31 -#define BD70528_REG_RTC_WEEK 0x32 -#define BD70528_REG_RTC_DAY 0x33 -#define BD70528_REG_RTC_MONTH 0x34 -#define BD70528_REG_RTC_YEAR 0x35 - -#define BD70528_REG_RTC_ALM_SEC 0x36 -#define BD70528_REG_RTC_ALM_START BD70528_REG_RTC_ALM_SEC -#define BD70528_REG_RTC_ALM_MINUTE 0x37 -#define BD70528_REG_RTC_ALM_HOUR 0x38 -#define BD70528_REG_RTC_ALM_WEEK 0x39 -#define BD70528_REG_RTC_ALM_DAY 0x3a -#define BD70528_REG_RTC_ALM_MONTH 0x3b -#define BD70528_REG_RTC_ALM_YEAR 0x3c -#define BD70528_REG_RTC_ALM_MASK 0x3d -#define BD70528_REG_RTC_ALM_REPEAT 0x3e -#define BD70528_REG_RTC_START BD70528_REG_RTC_SEC - -#define BD70528_REG_RTC_WAKE_SEC 0x43 -#define BD70528_REG_RTC_WAKE_START BD70528_REG_RTC_WAKE_SEC -#define BD70528_REG_RTC_WAKE_MIN 0x44 -#define 
BD70528_REG_RTC_WAKE_HOUR 0x45 -#define BD70528_REG_RTC_WAKE_CTRL 0x46 - -#define BD70528_REG_ELAPSED_TIMER_EN 0x42 -#define BD70528_REG_WAKE_EN 0x46 - -/* WDT registers */ -#define BD70528_REG_WDT_CTRL 0x4A -#define BD70528_REG_WDT_HOUR 0x49 -#define BD70528_REG_WDT_MINUTE 0x48 -#define BD70528_REG_WDT_SEC 0x47 - -/* Charger / Battery */ -#define BD70528_REG_CHG_CURR_STAT 0x59 -#define BD70528_REG_CHG_BAT_STAT 0x57 -#define BD70528_REG_CHG_BAT_TEMP 0x58 -#define BD70528_REG_CHG_IN_STAT 0x56 -#define BD70528_REG_CHG_DCIN_ILIM 0x5d -#define BD70528_REG_CHG_CHG_CURR_WARM 0x61 -#define BD70528_REG_CHG_CHG_CURR_COLD 0x62 - -/* Masks for main IRQ register bits */ -enum { - BD70528_INT_SHDN, -#define BD70528_INT_SHDN_MASK BIT(BD70528_INT_SHDN) - BD70528_INT_PWR_FLT, -#define BD70528_INT_PWR_FLT_MASK BIT(BD70528_INT_PWR_FLT) - BD70528_INT_VR_FLT, -#define BD70528_INT_VR_FLT_MASK BIT(BD70528_INT_VR_FLT) - BD70528_INT_MISC, -#define BD70528_INT_MISC_MASK BIT(BD70528_INT_MISC) - BD70528_INT_BAT1, -#define BD70528_INT_BAT1_MASK BIT(BD70528_INT_BAT1) - BD70528_INT_RTC, -#define BD70528_INT_RTC_MASK BIT(BD70528_INT_RTC) - BD70528_INT_GPIO, -#define BD70528_INT_GPIO_MASK BIT(BD70528_INT_GPIO) - BD70528_INT_OP_FAIL, -#define BD70528_INT_OP_FAIL_MASK BIT(BD70528_INT_OP_FAIL) -}; - -/* IRQs */ -enum { - /* Shutdown register IRQs */ - BD70528_INT_LONGPUSH, - BD70528_INT_WDT, - BD70528_INT_HWRESET, - BD70528_INT_RSTB_FAULT, - BD70528_INT_VBAT_UVLO, - BD70528_INT_TSD, - BD70528_INT_RSTIN, - /* Power failure register IRQs */ - BD70528_INT_BUCK1_FAULT, - BD70528_INT_BUCK2_FAULT, - BD70528_INT_BUCK3_FAULT, - BD70528_INT_LDO1_FAULT, - BD70528_INT_LDO2_FAULT, - BD70528_INT_LDO3_FAULT, - BD70528_INT_LED1_FAULT, - BD70528_INT_LED2_FAULT, - /* VR FAULT register IRQs */ - BD70528_INT_BUCK1_OCP, - BD70528_INT_BUCK2_OCP, - BD70528_INT_BUCK3_OCP, - BD70528_INT_LED1_OCP, - BD70528_INT_LED2_OCP, - BD70528_INT_BUCK1_FULLON, - BD70528_INT_BUCK2_FULLON, - /* PMU register interrupts */ - 
BD70528_INT_SHORTPUSH, - BD70528_INT_AUTO_WAKEUP, - BD70528_INT_STATE_CHANGE, - /* Charger 1 register IRQs */ - BD70528_INT_BAT_OV_RES, - BD70528_INT_BAT_OV_DET, - BD70528_INT_DBAT_DET, - BD70528_INT_BATTSD_COLD_RES, - BD70528_INT_BATTSD_COLD_DET, - BD70528_INT_BATTSD_HOT_RES, - BD70528_INT_BATTSD_HOT_DET, - BD70528_INT_CHG_TSD, - /* Charger 2 register IRQs */ - BD70528_INT_BAT_RMV, - BD70528_INT_BAT_DET, - BD70528_INT_DCIN2_OV_RES, - BD70528_INT_DCIN2_OV_DET, - BD70528_INT_DCIN2_RMV, - BD70528_INT_DCIN2_DET, - BD70528_INT_DCIN1_RMV, - BD70528_INT_DCIN1_DET, - /* RTC register IRQs */ - BD70528_INT_RTC_ALARM, - BD70528_INT_ELPS_TIM, - /* GPIO register IRQs */ - BD70528_INT_GPIO0, - BD70528_INT_GPIO1, - BD70528_INT_GPIO2, - BD70528_INT_GPIO3, - /* Invalid operation register IRQs */ - BD70528_INT_BUCK1_DVS_OPFAIL, - BD70528_INT_BUCK2_DVS_OPFAIL, - BD70528_INT_BUCK3_DVS_OPFAIL, - BD70528_INT_LED1_VOLT_OPFAIL, - BD70528_INT_LED2_VOLT_OPFAIL, -}; - -/* Masks */ -#define BD70528_INT_LONGPUSH_MASK 0x1 -#define BD70528_INT_WDT_MASK 0x2 -#define BD70528_INT_HWRESET_MASK 0x4 -#define BD70528_INT_RSTB_FAULT_MASK 0x8 -#define BD70528_INT_VBAT_UVLO_MASK 0x10 -#define BD70528_INT_TSD_MASK 0x20 -#define BD70528_INT_RSTIN_MASK 0x40 - -#define BD70528_INT_BUCK1_FAULT_MASK 0x1 -#define BD70528_INT_BUCK2_FAULT_MASK 0x2 -#define BD70528_INT_BUCK3_FAULT_MASK 0x4 -#define BD70528_INT_LDO1_FAULT_MASK 0x8 -#define BD70528_INT_LDO2_FAULT_MASK 0x10 -#define BD70528_INT_LDO3_FAULT_MASK 0x20 -#define BD70528_INT_LED1_FAULT_MASK 0x40 -#define BD70528_INT_LED2_FAULT_MASK 0x80 - -#define BD70528_INT_BUCK1_OCP_MASK 0x1 -#define BD70528_INT_BUCK2_OCP_MASK 0x2 -#define BD70528_INT_BUCK3_OCP_MASK 0x4 -#define BD70528_INT_LED1_OCP_MASK 0x8 -#define BD70528_INT_LED2_OCP_MASK 0x10 -#define BD70528_INT_BUCK1_FULLON_MASK 0x20 -#define BD70528_INT_BUCK2_FULLON_MASK 0x40 - -#define BD70528_INT_SHORTPUSH_MASK 0x1 -#define BD70528_INT_AUTO_WAKEUP_MASK 0x2 -#define BD70528_INT_STATE_CHANGE_MASK 0x10 - -#define 
BD70528_INT_BAT_OV_RES_MASK 0x1 -#define BD70528_INT_BAT_OV_DET_MASK 0x2 -#define BD70528_INT_DBAT_DET_MASK 0x4 -#define BD70528_INT_BATTSD_COLD_RES_MASK 0x8 -#define BD70528_INT_BATTSD_COLD_DET_MASK 0x10 -#define BD70528_INT_BATTSD_HOT_RES_MASK 0x20 -#define BD70528_INT_BATTSD_HOT_DET_MASK 0x40 -#define BD70528_INT_CHG_TSD_MASK 0x80 - -#define BD70528_INT_BAT_RMV_MASK 0x1 -#define BD70528_INT_BAT_DET_MASK 0x2 -#define BD70528_INT_DCIN2_OV_RES_MASK 0x4 -#define BD70528_INT_DCIN2_OV_DET_MASK 0x8 -#define BD70528_INT_DCIN2_RMV_MASK 0x10 -#define BD70528_INT_DCIN2_DET_MASK 0x20 -#define BD70528_INT_DCIN1_RMV_MASK 0x40 -#define BD70528_INT_DCIN1_DET_MASK 0x80 - -#define BD70528_INT_RTC_ALARM_MASK 0x1 -#define BD70528_INT_ELPS_TIM_MASK 0x2 - -#define BD70528_INT_GPIO0_MASK 0x1 -#define BD70528_INT_GPIO1_MASK 0x2 -#define BD70528_INT_GPIO2_MASK 0x4 -#define BD70528_INT_GPIO3_MASK 0x8 - -#define BD70528_INT_BUCK1_DVS_OPFAIL_MASK 0x1 -#define BD70528_INT_BUCK2_DVS_OPFAIL_MASK 0x2 -#define BD70528_INT_BUCK3_DVS_OPFAIL_MASK 0x4 -#define BD70528_INT_LED1_VOLT_OPFAIL_MASK 0x10 -#define BD70528_INT_LED2_VOLT_OPFAIL_MASK 0x20 - -#define BD70528_DEBOUNCE_MASK 0x3 - -#define BD70528_DEBOUNCE_DISABLE 0 -#define BD70528_DEBOUNCE_15MS 1 -#define BD70528_DEBOUNCE_30MS 2 -#define BD70528_DEBOUNCE_50MS 3 - -#define BD70528_GPIO_DRIVE_MASK 0x2 -#define BD70528_GPIO_PUSH_PULL 0x0 -#define BD70528_GPIO_OPEN_DRAIN 0x2 - -#define BD70528_GPIO_OUT_EN_MASK 0x80 -#define BD70528_GPIO_OUT_ENABLE 0x80 -#define BD70528_GPIO_OUT_DISABLE 0x0 - -#define BD70528_GPIO_OUT_HI 0x1 -#define BD70528_GPIO_OUT_LO 0x0 -#define BD70528_GPIO_OUT_MASK 0x1 - -#define BD70528_GPIO_IN_STATE_BASE 1 - -/* RTC masks to mask out reserved bits */ - -#define BD70528_MASK_ELAPSED_TIMER_EN 0x1 -/* Mask second, min and hour fields - * HW would support ALM irq for over 24h - * (by setting day, month and year too) - * but as we wish to keep this same as for - * wake-up we limit ALM to 24H and only - * unmask sec, min and hour 
- */ -#define BD70528_MASK_WAKE_EN 0x1 - -/* WDT masks */ -#define BD70528_MASK_WDT_EN 0x1 -#define BD70528_MASK_WDT_HOUR 0x1 -#define BD70528_MASK_WDT_MINUTE 0x7f -#define BD70528_MASK_WDT_SEC 0x7f - -#define BD70528_WDT_STATE_BIT 0x1 -#define BD70528_ELAPSED_STATE_BIT 0x2 -#define BD70528_WAKE_STATE_BIT 0x4 - -/* Charger masks */ -#define BD70528_MASK_CHG_STAT 0x7f -#define BD70528_MASK_CHG_BAT_TIMER 0x20 -#define BD70528_MASK_CHG_BAT_OVERVOLT 0x10 -#define BD70528_MASK_CHG_BAT_DETECT 0x1 -#define BD70528_MASK_CHG_DCIN1_UVLO 0x1 -#define BD70528_MASK_CHG_DCIN_ILIM 0x3f -#define BD70528_MASK_CHG_CHG_CURR 0x1f -#define BD70528_MASK_CHG_TRICKLE_CURR 0x10 - -/* - * Note, external battery register is the lonely rider at - * address 0xc5. See how to stuff that in the regmap - */ -#define BD70528_MAX_REGISTER 0x94 - -/* Buck control masks */ -#define BD70528_MASK_RUN_EN 0x4 -#define BD70528_MASK_STBY_EN 0x2 -#define BD70528_MASK_IDLE_EN 0x1 -#define BD70528_MASK_LED1_EN 0x1 -#define BD70528_MASK_LED2_EN 0x10 - -#define BD70528_MASK_BUCK_VOLT 0xf -#define BD70528_MASK_LDO_VOLT 0x1f -#define BD70528_MASK_LED1_VOLT 0x1 -#define BD70528_MASK_LED2_VOLT 0x10 - -/* Misc irq masks */ -#define BD70528_INT_MASK_SHORT_PUSH 1 -#define BD70528_INT_MASK_AUTO_WAKE 2 -#define BD70528_INT_MASK_POWER_STATE 4 - -#define BD70528_MASK_BUCK_RAMP 0x10 -#define BD70528_SIFT_BUCK_RAMP 4 - -#if IS_ENABLED(CONFIG_BD70528_WATCHDOG) - -int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable, int *old_state); -void bd70528_wdt_lock(struct rohm_regmap_dev *data); -void bd70528_wdt_unlock(struct rohm_regmap_dev *data); - -#else /* CONFIG_BD70528_WATCHDOG */ - -static inline int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable, - int *old_state) -{ - return 0; -} - -static inline void bd70528_wdt_lock(struct rohm_regmap_dev *data) -{ -} - -static inline void bd70528_wdt_unlock(struct rohm_regmap_dev *data) -{ -} - -#endif /* CONFIG_BD70528_WATCHDOG */ - -#endif /* __LINUX_MFD_BD70528_H__ 
*/ diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h index 35b392a0d73a..8fb763a2265a 100644 --- a/include/linux/mfd/rohm-generic.h +++ b/include/linux/mfd/rohm-generic.h @@ -12,7 +12,6 @@ enum rohm_chip_type { ROHM_CHIP_TYPE_BD9573, ROHM_CHIP_TYPE_BD9574, ROHM_CHIP_TYPE_BD9576, - ROHM_CHIP_TYPE_BD70528, ROHM_CHIP_TYPE_BD71815, ROHM_CHIP_TYPE_BD71828, ROHM_CHIP_TYPE_BD71837, -- cgit v1.2.3 From a9c8f68ce2c37ced2f7a8667eda71b7753ede398 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 23 Nov 2021 21:27:22 +0200 Subject: spi: pxa2xx: Get rid of unused ->cs_control() Since the last user of the custom ->cs_control() is gone, we may get rid of this legacy API completely. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211123192723.44537-2-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/pxa2xx_spi.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h index eaab121ee575..42e06bfbc2a4 100644 --- a/include/linux/spi/pxa2xx_spi.h +++ b/include/linux/spi/pxa2xx_spi.h @@ -9,9 +9,6 @@ #include -#define PXA2XX_CS_ASSERT (0x01) -#define PXA2XX_CS_DEASSERT (0x02) - struct dma_chan; /* @@ -47,7 +44,6 @@ struct pxa2xx_spi_chip { u32 timeout; u8 enable_loopback; int gpio_cs; - void (*cs_control)(u32 command); }; #if defined(CONFIG_ARCH_PXA) || defined(CONFIG_ARCH_MMP) -- cgit v1.2.3 From 8393961c53b31078cfc877bc00eb0f67e1474edd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 23 Nov 2021 21:27:23 +0200 Subject: spi: pxa2xx: Get rid of unused enable_loopback member There is no user of the enable_loopback member in the struct pxa2xx_spi_chip. Remove this legacy member completely. The testing phase mentioned in the documentation can be performed with the spidev_test tool.
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211123192723.44537-3-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/pxa2xx_spi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h index 42e06bfbc2a4..ca74dce36706 100644 --- a/include/linux/spi/pxa2xx_spi.h +++ b/include/linux/spi/pxa2xx_spi.h @@ -42,7 +42,6 @@ struct pxa2xx_spi_chip { u8 rx_threshold; u8 dma_burst_size; u32 timeout; - u8 enable_loopback; int gpio_cs; }; -- cgit v1.2.3 From b99658452355d316debee11079e8f1c6c1029355 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Sun, 28 Nov 2021 17:57:37 -0800 Subject: net: dsa: ocelot: felix: utilize shared mscc-miim driver for indirect MDIO access Switch to a shared MDIO access implementation by way of the mdio-mscc-miim driver. Signed-off-by: Colin Foster Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/mdio/mdio-mscc-miim.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/mdio/mdio-mscc-miim.h (limited to 'include/linux') diff --git a/include/linux/mdio/mdio-mscc-miim.h b/include/linux/mdio/mdio-mscc-miim.h new file mode 100644 index 000000000000..5b4ed2c3cbb9 --- /dev/null +++ b/include/linux/mdio/mdio-mscc-miim.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* + * Driver for the MDIO interface of Microsemi network switches. 
+ * + * Author: Colin Foster + * Copyright (C) 2021 Innovative Advantage + */ +#ifndef MDIO_MSCC_MIIM_H +#define MDIO_MSCC_MIIM_H + +#include +#include +#include + +int mscc_miim_setup(struct device *device, struct mii_bus **bus, + const char *name, struct regmap *mii_regmap, + int status_offset); + +#endif -- cgit v1.2.3 From 17247821ae9b40ea6df8d771cfca97d91675be93 Mon Sep 17 00:00:00 2001 From: Dario Binacchi Date: Thu, 25 Nov 2021 23:46:42 +0100 Subject: mfd: ti_am335x_tscadc: Drop the CNTRLREG_TSC_8WIRE macro In TI's reference manual description for the `AFE_Pen_Ctrl' bit-field of the TSC's CTRL register, there is no mention of 8-wire touchscreens. Even commit f0933a60d190 ("mfd: ti_am335x_tscadc: Update logic in CTRL register for 5-wire TS") says that the value of this bit-field must be the same for 4-wire and 8-wire touchscreens. So let's remove the CNTRLREG_TSC_8WIRE macro to avoid misunderstandings. Signed-off-by: Dario Binacchi Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211125224642.21011-5-dariobin@libero.it --- include/linux/mfd/ti_am335x_tscadc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index ba13e043d910..4063b0614d90 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -103,7 +103,6 @@ #define CNTRLREG_TSC_AFE_CTRL(val) FIELD_PREP(GENMASK(6, 5), (val)) #define CNTRLREG_TSC_4WIRE CNTRLREG_TSC_AFE_CTRL(1) #define CNTRLREG_TSC_5WIRE CNTRLREG_TSC_AFE_CTRL(2) -#define CNTRLREG_TSC_8WIRE CNTRLREG_TSC_AFE_CTRL(3) #define CNTRLREG_TSC_ENB BIT(7) /*Control registers bitfields for MAGADC IP */ -- cgit v1.2.3 From 79478bf9ea9fa48d30836afa796ac13d8a0f320b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Nov 2021 07:13:54 +0100 Subject: block: move blk_rq_err_bytes to scsi blk_rq_err_bytes is only used by the scsi midlayer, so move it there. 
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20211117061404.331732-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2949d9ac7484..a78d9a0f2a1b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -947,7 +947,6 @@ struct req_iterator { * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request * blk_rq_cur_bytes() : bytes left in the current segment - * blk_rq_err_bytes() : bytes left till the next error boundary * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment * blk_rq_stats_sectors() : sectors of the entire request used for stats @@ -971,8 +970,6 @@ static inline int blk_rq_cur_bytes(const struct request *rq) return bio_iovec(rq->bio).bv_len; } -unsigned int blk_rq_err_bytes(const struct request *rq); - static inline unsigned int blk_rq_sectors(const struct request *rq) { return blk_rq_bytes(rq) >> SECTOR_SHIFT; -- cgit v1.2.3 From 786d4e01c550e8bb7c9f9f23bca0596a2a33483c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Nov 2021 07:13:55 +0100 Subject: block: remove rq_flush_dcache_pages This function is trivial, and flush_dcache_page is always defined, so just open code it in the 2.5 callers. 
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20211117061404.331732-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a78d9a0f2a1b..308edc2a4925 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1132,14 +1132,4 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq) } #endif /* CONFIG_BLK_DEV_ZONED */ -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" -#endif -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -void rq_flush_dcache_pages(struct request *rq); -#else -static inline void rq_flush_dcache_pages(struct request *rq) -{ -} -#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ #endif /* BLK_MQ_H */ -- cgit v1.2.3 From 86416916466514e4ae0b7296d20133b6427c4c1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Nov 2021 14:06:12 +0100 Subject: block: move GENHD_FL_NATIVE_CAPACITY to disk->state The flag to indicate an unlocked native capacity is dynamic state, not a driver capability flag, so move it to disk->state. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211122130625.1136848-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 74c410263113..e490a71e5e9d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -60,12 +60,6 @@ struct partition_meta_info { * (``BLOCK_EXT_MAJOR``). * This affects the maximum number of partitions. * - * ``GENHD_FL_NATIVE_CAPACITY`` (0x0080): based on information in the - * partition table, the device's capacity has been extended to its - * native capacity; i.e. 
the device has hidden capacity used by one - * of the partitions (this is a flag used so that native capacity is - * only ever unlocked once). - * * ``GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE`` (0x0100): event polling is * blocked whenever a writer holds an exclusive lock. * @@ -86,7 +80,6 @@ struct partition_meta_info { #define GENHD_FL_CD 0x0008 #define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 -#define GENHD_FL_NATIVE_CAPACITY 0x0080 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 0x0100 #define GENHD_FL_NO_PART_SCAN 0x0200 #define GENHD_FL_HIDDEN 0x0400 @@ -140,6 +133,7 @@ struct gendisk { #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 #define GD_DEAD 2 +#define GD_NATIVE_CAPACITY 3 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ -- cgit v1.2.3 From 1545e0b419ba1d9b9bee4061d4826340afe6b0aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Nov 2021 14:06:13 +0100 Subject: block: move GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE to disk->event_flags GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE is all about the event reporting mechanism, so move it to the event_flags field. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211122130625.1136848-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e490a71e5e9d..c1136ff3c91f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -60,9 +60,6 @@ struct partition_meta_info { * (``BLOCK_EXT_MAJOR``). * This affects the maximum number of partitions. * - * ``GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE`` (0x0100): event polling is - * blocked whenever a writer holds an exclusive lock. - * * ``GENHD_FL_NO_PART_SCAN`` (0x0200): partition scanning is disabled. * Used for loop devices in their default settings and some MMC * devices. 
@@ -80,7 +77,6 @@ struct partition_meta_info { #define GENHD_FL_CD 0x0008 #define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 -#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 0x0100 #define GENHD_FL_NO_PART_SCAN 0x0200 #define GENHD_FL_HIDDEN 0x0400 @@ -94,6 +90,8 @@ enum { DISK_EVENT_FLAG_POLL = 1 << 0, /* Forward events to udev */ DISK_EVENT_FLAG_UEVENT = 1 << 1, + /* Block event polling when open for exclusive write */ + DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, }; struct disk_events; -- cgit v1.2.3 From 1a827ce1b9f2c740d2c6a228afd972970c18bc21 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Nov 2021 14:06:14 +0100 Subject: block: remove GENHD_FL_CD GENHD_FL_CD marks a gendisk as a vaguely CD-ROM like device. Besides being used internally inside of sunvdc.c and xen-blkfront, it is used by xen-blkback as a hint to claim a device exported to a guest is a CD-ROM like device. Just check for disk->cdi instead which is the right indicator for "real" CD-ROM or DVD drivers. This will miss the paravirtualized guest drivers, but those make little sense to report anyway. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211122130625.1136848-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c1136ff3c91f..74518c576fbb 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -46,10 +46,6 @@ struct partition_meta_info { * Must not be set for devices which are removed entirely when the * media is removed. * - * ``GENHD_FL_CD`` (0x0008): the block device is a CD-ROM-style - * device. - * Affects responses to the ``CDROM_GET_CAPABILITY`` ioctl. - * * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include * partition information in ``/proc/partitions`` or in the output of * printk_all_partitions().
@@ -74,7 +70,6 @@ struct partition_meta_info { #define GENHD_FL_REMOVABLE 0x0001 /* 2 is unused (used to be GENHD_FL_DRIVERFS) */ /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ -#define GENHD_FL_CD 0x0008 #define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 #define GENHD_FL_NO_PART_SCAN 0x0200 -- cgit v1.2.3 From 46e7eac647b34ed4106a8262f8bedbb90801fadd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Nov 2021 14:06:17 +0100 Subject: block: rename GENHD_FL_NO_PART_SCAN to GENHD_FL_NO_PART The GENHD_FL_NO_PART_SCAN flag controls more than just partition scanning, so rename it to GENHD_FL_NO_PART. Signed-off-by: Christoph Hellwig Acked-by: Ulf Hansson Link: https://lore.kernel.org/r/20211122130625.1136848-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 74518c576fbb..0b9be3df9489 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -56,15 +56,15 @@ struct partition_meta_info { * (``BLOCK_EXT_MAJOR``). * This affects the maximum number of partitions. * - * ``GENHD_FL_NO_PART_SCAN`` (0x0200): partition scanning is disabled. - * Used for loop devices in their default settings and some MMC - * devices. + * ``GENHD_FL_NO_PART`` (0x0200): partition support is disabled. + * The kernel will not scan for partitions from add_disk, and users + * can't add partitions manually. * * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it * doesn't produce events, doesn't appear in sysfs, and doesn't have * an associated ``bdev``. * Implies ``GENHD_FL_SUPPRESS_PARTITION_INFO`` and - * ``GENHD_FL_NO_PART_SCAN``. + * ``GENHD_FL_NO_PART``. * Used for multipath devices.
*/ #define GENHD_FL_REMOVABLE 0x0001 @@ -72,7 +72,7 @@ struct partition_meta_info { /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ #define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 -#define GENHD_FL_NO_PART_SCAN 0x0200 +#define GENHD_FL_NO_PART 0x0200 #define GENHD_FL_HIDDEN 0x0400 enum { @@ -180,8 +180,7 @@ static inline int disk_max_parts(struct gendisk *disk) static inline bool disk_part_scan_enabled(struct gendisk *disk) { - return disk_max_parts(disk) > 1 && - !(disk->flags & GENHD_FL_NO_PART_SCAN); + return disk_max_parts(disk) > 1 && !(disk->flags & GENHD_FL_NO_PART); } static inline dev_t disk_devt(struct gendisk *disk) -- cgit v1.2.3 From 3b5149ac50970669ee0ddb9629ec77ffd5c0622d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Nov 2021 14:06:21 +0100 Subject: block: remove GENHD_FL_SUPPRESS_PARTITION_INFO This flag is not set directly anywhere and only inherited from GENHD_FL_HIDDEN. Just check for GENHD_FL_HIDDEN instead. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211122130625.1136848-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 0b9be3df9489..64a2f33ae9ea 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -46,11 +46,6 @@ struct partition_meta_info { * Must not be set for devices which are removed entirely when the * media is removed. * - * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include - * partition information in ``/proc/partitions`` or in the output of - * printk_all_partitions(). - * Used for the null block device and some MMC devices. - * * ``GENHD_FL_EXT_DEVT`` (0x0040): the driver supports extended * dynamic ``dev_t``, i.e. it wants extended device numbers * (``BLOCK_EXT_MAJOR``). 
@@ -63,14 +58,12 @@ struct partition_meta_info { * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it * doesn't produce events, doesn't appear in sysfs, and doesn't have * an associated ``bdev``. - * Implies ``GENHD_FL_SUPPRESS_PARTITION_INFO`` and - * ``GENHD_FL_NO_PART``. + * Implies ``GENHD_FL_NO_PART``. * Used for multipath devices. */ #define GENHD_FL_REMOVABLE 0x0001 /* 2 is unused (used to be GENHD_FL_DRIVERFS) */ /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ -#define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 #define GENHD_FL_NO_PART 0x0200 #define GENHD_FL_HIDDEN 0x0400 -- cgit v1.2.3 From 1ebe2e5f9d68e94c524aba876f27b945669a7879 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Nov 2021 14:06:22 +0100 Subject: block: remove GENHD_FL_EXT_DEVT All modern drivers can support extra partitions using the extended dev_t. In fact except for the ioctl method drivers never even see partitions in normal operation. So remove the GENHD_FL_EXT_DEVT and allow extra partitions for all block devices that do support partitions, and require those that do not support partitions to explicit disallow them using GENHD_FL_NO_PART. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211122130625.1136848-12-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 64a2f33ae9ea..b8ced80178d6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -46,11 +46,6 @@ struct partition_meta_info { * Must not be set for devices which are removed entirely when the * media is removed. * - * ``GENHD_FL_EXT_DEVT`` (0x0040): the driver supports extended - * dynamic ``dev_t``, i.e. it wants extended device numbers - * (``BLOCK_EXT_MAJOR``). - * This affects the maximum number of partitions. 
- * * ``GENHD_FL_NO_PART`` (0x0200): partition support is disabled. * The kernel will not scan for partitions from add_disk, and users * can't add partitions manually. @@ -64,7 +59,6 @@ struct partition_meta_info { #define GENHD_FL_REMOVABLE 0x0001 /* 2 is unused (used to be GENHD_FL_DRIVERFS) */ /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ -#define GENHD_FL_EXT_DEVT 0x0040 #define GENHD_FL_NO_PART 0x0200 #define GENHD_FL_HIDDEN 0x0400 @@ -94,13 +88,13 @@ struct blk_integrity { }; struct gendisk { - /* major, first_minor and minors are input parameters only, - * don't use directly. Use disk_devt() and disk_max_parts(). + /* + * major/first_minor/minors should not be set by any new driver, the + * block core will take care of allocating them automatically. */ - int major; /* major number of driver */ + int major; int first_minor; - int minors; /* maximum number of minors, =1 for - * disks that can't be partitioned. */ + int minors; char disk_name[DISK_NAME_LEN]; /* name of major driver */ @@ -164,18 +158,6 @@ static inline bool disk_live(struct gendisk *disk) #define disk_to_cdi(disk) NULL #endif -static inline int disk_max_parts(struct gendisk *disk) -{ - if (disk->flags & GENHD_FL_EXT_DEVT) - return DISK_MAX_PARTS; - return disk->minors; -} - -static inline bool disk_part_scan_enabled(struct gendisk *disk) -{ - return disk_max_parts(disk) > 1 && !(disk->flags & GENHD_FL_NO_PART); -} - static inline dev_t disk_devt(struct gendisk *disk) { return MKDEV(disk->major, disk->first_minor); -- cgit v1.2.3 From 430cc5d3ab4d0ba0bd011cfbb0035e46ba92920c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Nov 2021 14:06:24 +0100 Subject: block: cleanup the GENHD_FL_* definitions Switch to an enum and tidy up the documentation. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211122130625.1136848-14-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b8ced80178d6..6906a45bc761 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -39,28 +39,24 @@ struct partition_meta_info { /** * DOC: genhd capability flags * - * ``GENHD_FL_REMOVABLE`` (0x0001): indicates that the block device - * gives access to removable media. - * When set, the device remains present even when media is not - * inserted. - * Must not be set for devices which are removed entirely when the + * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to + * removable media. When set, the device remains present even when media is not + * inserted. Shall not be set for devices which are removed entirely when the * media is removed. * - * ``GENHD_FL_NO_PART`` (0x0200): partition support is disabled. - * The kernel will not scan for partitions from add_disk, and users - * can't add partitions manually. + * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events, + * doesn't appear in sysfs, and can't be opened from userspace or using + * blkdev_get*. Used for the underlying components of multipath devices. + * + * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not + * scan for partitions from add_disk, and users can't add partitions manually. * - * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it - * doesn't produce events, doesn't appear in sysfs, and doesn't have - * an associated ``bdev``. - * Implies ``GENHD_FL_NO_PART``. - * Used for multipath devices. 
*/ -#define GENHD_FL_REMOVABLE 0x0001 -/* 2 is unused (used to be GENHD_FL_DRIVERFS) */ -/* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ -#define GENHD_FL_NO_PART 0x0200 -#define GENHD_FL_HIDDEN 0x0400 +enum { + GENHD_FL_REMOVABLE = 1 << 0, + GENHD_FL_HIDDEN = 1 << 1, + GENHD_FL_NO_PART = 1 << 2, +}; enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ -- cgit v1.2.3 From 48b5c1fbcd8c5bc6b91a56399a5257b801391dd8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Nov 2021 14:03:26 -0700 Subject: block: only allocate poll_stats if there's a user of them This is essentially never used, yet it's about 1/3rd of the total queue size. Allocate it when needed, and don't embed it in the queue. Kill the queue flag for this while at it, since we can just check the assigned pointer now. Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bd4370baccca..74118e67f649 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -267,7 +267,7 @@ struct request_queue { int poll_nsec; struct blk_stat_callback *poll_cb; - struct blk_rq_stat poll_stat[BLK_MQ_POLL_STATS_BKTS]; + struct blk_rq_stat *poll_stat; struct timer_list timeout; struct work_struct timeout_work; @@ -397,7 +397,6 @@ struct request_queue { #define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ #define QUEUE_FLAG_DAX 19 /* device supports DAX */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ -#define QUEUE_FLAG_POLL_STATS 21 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ -- cgit v1.2.3 From 72cd9df2ef788d88c138d51223a01ca6281f232d Mon Sep 17 
00:00:00 2001 From: Eric Biggers Date: Tue, 23 Nov 2021 17:37:33 -0800 Subject: blk-crypto: remove blk_crypto_unregister() This function is trivial and is only used in one place. Having this function is misleading because it implies that blk_crypto_register() needs to be paired with blk_crypto_unregister(), which is not the case. Just set disk->queue->crypto_profile to NULL directly. Signed-off-by: Eric Biggers Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211124013733.347612-1-ebiggers@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 74118e67f649..0a4416ef4fbf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1170,8 +1170,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo bool blk_crypto_register(struct blk_crypto_profile *profile, struct request_queue *q); -void blk_crypto_unregister(struct request_queue *q); - #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool blk_crypto_register(struct blk_crypto_profile *profile, @@ -1180,8 +1178,6 @@ static inline bool blk_crypto_register(struct blk_crypto_profile *profile, return true; } -static inline void blk_crypto_unregister(struct request_queue *q) { } - #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ enum blk_unique_id { -- cgit v1.2.3 From e8dc17e2893b4107366004810ca2a4acf1fc8563 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 25 Oct 2021 09:06:57 +0200 Subject: blk-mq: Add blk_mq_complete_request_direct() Add blk_mq_complete_request_direct() which completes the block request directly instead deferring it to softirq for single queue devices. This is useful for devices which complete the requests in preemptible context and raising softirq from means scheduling ksoftirqd. 
Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211025070658.1565848-2-bigeasy@linutronix.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 308edc2a4925..d952c3442261 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -752,6 +752,17 @@ static inline void blk_mq_set_request_complete(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); } +/* + * Complete the request directly instead of deferring it to softirq or + * completing it another CPU. Useful in preemptible instead of an interrupt. + */ +static inline void blk_mq_complete_request_direct(struct request *rq, + void (*complete)(struct request *rq)) +{ + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + complete(rq); +} + void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); -- cgit v1.2.3 From 88c9a2ce520ba381bb70658c80ec704f4d60f728 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:05 +0100 Subject: fork: move copy_io to block/blk-ioc.c Move the copying of the I/O context to the block layer as that is where we can use the proper low-level interfaces. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 0a9dc40b7be8..bcd47d104d8e 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -129,14 +129,6 @@ static inline void get_io_context_active(struct io_context *ioc) atomic_inc(&ioc->active_ref); } -static inline void ioc_task_link(struct io_context *ioc) -{ - get_io_context_active(ioc); - - WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); - atomic_inc(&ioc->nr_tasks); -} - struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); @@ -144,10 +136,21 @@ void put_io_context_active(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); +int __copy_io(unsigned long clone_flags, struct task_struct *tsk); +static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + if (!current->io_context) + return 0; + return __copy_io(clone_flags, tsk); +} #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } -#endif +static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + return 0; +} +#endif /* CONFIG_BLOCK */ -#endif +#endif /* IOCONTEXT_H */ -- cgit v1.2.3 From 3304742562d27fb87a6d8291cc48824dd20f6964 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:09 +0100 Subject: block: mark put_io_context_active static Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 1 - 1 file changed, 1 deletion(-) (limited to 
'include/linux') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index bcd47d104d8e..3ba45953d522 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -132,7 +132,6 @@ static inline void get_io_context_active(struct io_context *ioc) struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); -void put_io_context_active(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); -- cgit v1.2.3 From 50569c24be61eafb3efa06e2a3ccd447f75ae1b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:12 +0100 Subject: block: remove get_io_context_active Fold it into its only caller, and remove a lot of the debug checks that are not needed. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-10-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 3ba45953d522..c1229fbd6691 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -113,22 +113,6 @@ struct io_context { struct work_struct release_work; }; -/** - * get_io_context_active - get active reference on ioc - * @ioc: ioc of interest - * - * Only iocs with active reference can issue new IOs. This function - * acquires an active reference on @ioc. The caller must already have an - * active reference on @ioc.
- */ -static inline void get_io_context_active(struct io_context *ioc) -{ - WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0); - WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0); - atomic_long_inc(&ioc->refcount); - atomic_inc(&ioc->active_ref); -} - struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); -- cgit v1.2.3 From f3fa33acca9f0058157214800f68b10d8e71ab7a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 13:18:00 +0100 Subject: block: remove the ->rq_disk field in struct request Just use the disk attached to the request_queue instead. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20211126121802.2090656-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d952c3442261..ede7bef8880a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -100,7 +100,6 @@ struct request { struct request *rq_next; }; - struct gendisk *rq_disk; struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. */ @@ -890,9 +889,6 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); - - if (bio->bi_bdev) - rq->rq_disk = bio->bi_bdev->bd_disk; } void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, -- cgit v1.2.3 From b84ba30b6c7a75babdf73b83bc3c7b59b944501a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 13:18:01 +0100 Subject: block: remove the gendisk argument to blk_execute_rq Remove the gendisk argument to blk_execute_rq and blk_execute_rq_nowait given that it is unused now. Also convert the boolean at_head parameter to actually use the bool type while touching the prototype.
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20211126121802.2090656-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ede7bef8880a..1b87b7c8bbff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -924,10 +924,9 @@ int blk_rq_unmap_user(struct bio *); int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); int blk_rq_append_bio(struct request *rq, struct bio *bio); -void blk_execute_rq_nowait(struct gendisk *, struct request *, int, - rq_end_io_fn *); -blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, - int at_head); +void blk_execute_rq_nowait(struct request *rq, bool at_head, + rq_end_io_fn *end_io); +blk_status_t blk_execute_rq(struct request *rq, bool at_head); struct req_iterator { struct bvec_iter iter; -- cgit v1.2.3 From f7e5b9bfa6c8820407b64eabc1f29c9a87e8993d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 29 Nov 2021 10:39:29 -0500 Subject: siphash: use _unaligned version by default On ARM v6 and later, we define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS because the ordinary load/store instructions (ldr, ldrh, ldrb) can tolerate any misalignment of the memory address. However, load/store double and load/store multiple instructions (ldrd, ldm) may still only be used on memory addresses that are 32-bit aligned, and so we have to use the CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS macro with care, or we may end up with a severe performance hit due to alignment traps that require fixups by the kernel. Testing shows that this currently happens with clang-13 but not gcc-11. 
In theory, any compiler version can produce this bug or other problems, as we are dealing with undefined behavior in C99 even on architectures that support this in hardware, see also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100363. Fortunately, the get_unaligned() accessors do the right thing: when building for ARMv6 or later, the compiler will emit unaligned accesses using the ordinary load/store instructions (but avoid the ones that require 32-bit alignment). When building for older ARM, those accessors will emit the appropriate sequence of ldrb/mov/orr instructions. And on architectures that can truly tolerate any kind of misalignment, the get_unaligned() accessors resolve to the leXX_to_cpup accessors that operate on aligned addresses. Since the compiler will in fact emit ldrd or ldm instructions when building this code for ARM v6 or later, the solution is to use the unaligned accessors unconditionally on architectures where this is known to be fast. The _aligned version of the hash function is however still needed to get the best performance on architectures that cannot do any unaligned access in hardware. This new version avoids the undefined behavior and should produce the fastest hash on all architectures we support. Link: https://lore.kernel.org/linux-arm-kernel/20181008211554.5355-4-ard.biesheuvel@linaro.org/ Link: https://lore.kernel.org/linux-crypto/CAK8P3a2KfmmGDbVHULWevB0hv71P2oi2ZCHEAqT=8dQfa0=cqQ@mail.gmail.com/ Reported-by: Ard Biesheuvel Fixes: 2c956a60778c ("siphash: add cryptographically secure PRF") Signed-off-by: Arnd Bergmann Reviewed-by: Jason A. Donenfeld Acked-by: Ard Biesheuvel Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Jakub Kicinski --- include/linux/siphash.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/siphash.h b/include/linux/siphash.h index bf21591a9e5e..0cda61855d90 100644 --- a/include/linux/siphash.h +++ b/include/linux/siphash.h @@ -27,9 +27,7 @@ static inline bool siphash_key_is_zero(const siphash_key_t *key) } u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); -#endif u64 siphash_1u64(const u64 a, const siphash_key_t *key); u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); @@ -82,10 +80,9 @@ static inline u64 ___siphash_aligned(const __le64 *data, size_t len, static inline u64 siphash(const void *data, size_t len, const siphash_key_t *key) { -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) return __siphash_unaligned(data, len, key); -#endif return ___siphash_aligned(data, len, key); } @@ -96,10 +93,8 @@ typedef struct { u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key); -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key); -#endif u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); @@ -135,10 +130,9 @@ static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, static inline u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key) { -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + 
!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) return __hsiphash_unaligned(data, len, key); -#endif return ___hsiphash_aligned(data, len, key); } -- cgit v1.2.3 From 4047b9db1aa7512a10ba3560a3f63821c8c40235 Mon Sep 17 00:00:00 2001 From: Bhupesh Sharma Date: Mon, 29 Nov 2021 01:28:54 +0530 Subject: net: stmmac: Add platform level debug register dump feature dwmac-qcom-ethqos currently exposes a mechanism to dump rgmii registers after the 'stmmac_dvr_probe()' returns. However with commit 5ec55823438e ("net: stmmac: add clocks management for gmac driver"), we now let 'pm_runtime_put()' disable the clocks before returning from 'stmmac_dvr_probe()'. This causes a crash when 'rgmii_dump()' register dumps are enabled, as the clocks are already off. Since other dwmac drivers (possible future users as well) might require a similar register dump feature, introduce a platform level callback to allow the same. This fixes the crash noticed while enabling rgmii_dump() dumps in dwmac-qcom-ethqos driver as well. It also allows future changes to keep invoking the register dump callback from the correct place inside 'stmmac_dvr_probe()'. Fixes: 5ec55823438e ("net: stmmac: add clocks management for gmac driver") Cc: Joakim Zhang Cc: David S. Miller Signed-off-by: Bhupesh Sharma Signed-off-by: David S.
Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 89b8e208cd7b..24eea1b05ca2 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -233,6 +233,7 @@ struct plat_stmmacenet_data { int (*clks_config)(void *priv, bool enabled); int (*crosststamp)(ktime_t *device, struct system_counterval_t *system, void *ctx); + void (*dump_debug_regs)(void *priv); void *bsp_priv; struct clk *stmmac_clk; struct clk *pclk; -- cgit v1.2.3 From 79364031c5b4365ca28ac0fa00acfab5bf465be1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 27 Nov 2021 17:32:00 +0100 Subject: bpf: Make sure bpf_disable_instrumentation() is safe vs preemption. The initial implementation of migrate_disable() for mainline was a wrapper around preempt_disable(). RT kernels substituted this with a real migrate disable implementation. Later on mainline gained true migrate disable support, but neither documentation nor affected code were updated. Remove stale comments claiming that migrate_disable() is PREEMPT_RT only. Don't use __this_cpu_inc() in the !PREEMPT_RT path because preemption is not disabled and the RMW operation can be preempted. 
Fixes: 74d862b682f51 ("sched: Make migrate_disable/enable() independent of RT") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211127163200.10466-3-bigeasy@linutronix.de --- include/linux/bpf.h | 16 ++-------------- include/linux/filter.h | 3 --- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 84ff6ef49462..755f38e893be 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1353,28 +1353,16 @@ extern struct mutex bpf_stats_enabled_mutex; * kprobes, tracepoints) to prevent deadlocks on map operations as any of * these events can happen inside a region which holds a map bucket lock * and can deadlock on it. - * - * Use the preemption safe inc/dec variants on RT because migrate disable - * is preemptible on RT and preemption in the middle of the RMW operation - * might lead to inconsistent state. Use the raw variants for non RT - * kernels as migrate_disable() maps to preempt_disable() so the slightly - * more expensive save operation can be avoided. 
*/ static inline void bpf_disable_instrumentation(void) { migrate_disable(); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - this_cpu_inc(bpf_prog_active); - else - __this_cpu_inc(bpf_prog_active); + this_cpu_inc(bpf_prog_active); } static inline void bpf_enable_instrumentation(void) { - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - this_cpu_dec(bpf_prog_active); - else - __this_cpu_dec(bpf_prog_active); + this_cpu_dec(bpf_prog_active); migrate_enable(); } diff --git a/include/linux/filter.h b/include/linux/filter.h index 24b7ed2677af..534f678ca50f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -640,9 +640,6 @@ static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void * This uses migrate_disable/enable() explicitly to document that the * invocation of a BPF program does not require reentrancy protection * against a BPF program which is invoked from a preempting task. - * - * For non RT enabled kernels migrate_disable/enable() maps to - * preempt_disable/enable(), i.e. it disables also preemption. */ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, const void *ctx) -- cgit v1.2.3 From e6f2dd0f80674e9d5960337b3e9c2a242441b326 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 29 Nov 2021 19:06:19 -0800 Subject: bpf: Add bpf_loop helper This patch adds the kernel-side and API changes for a new helper function, bpf_loop: long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags); where long (*callback_fn)(u32 index, void *ctx); bpf_loop invokes the "callback_fn" **nr_loops** times or until the callback_fn returns 1. The callback_fn can only return 0 or 1, and this is enforced by the verifier. The callback_fn index is zero-indexed. A few things to please note: ~ The "u64 flags" parameter is currently unused but is included in case a future use case for it arises. ~ In the kernel-side implementation of bpf_loop (kernel/bpf/bpf_iter.c), bpf_callback_t is used as the callback function cast. 
~ A program can have nested bpf_loop calls but the program must still adhere to the verifier constraint of its stack depth (the stack depth cannot exceed MAX_BPF_STACK) ~ Recursive callback_fns do not pass the verifier, due to the call stack for these being too deep. ~ The next patch will include the tests and benchmark Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211130030622.4131246-2-joannekoong@fb.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc7a0c36e7df..cad0829710be 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2164,6 +2164,7 @@ extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; extern const struct bpf_func_proto bpf_find_vma_proto; +extern const struct bpf_func_proto bpf_loop_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From 7ad639840acf2800b5f387c495795f995a67a329 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 29 Nov 2021 13:06:43 +0000 Subject: thread_info: Add helpers to snapshot thread flags In linux/thread_info.h there are helpers to manipulate individual thread flags, but where code wants to check several flags at once, it must open code reading current_thread_info()->flags and operating on a snapshot. As some flags can be set remotely it's necessary to use READ_ONCE() to get a consistent snapshot even when IRQs are disabled, but some code forgets to do this. Generally this is unlikely to cause a problem in practice, but it is somewhat unsound, and KCSAN will legitimately warn that there is a data race.
To make it easier to do the right thing, and to highlight that concurrent modification is possible, add new helpers to snapshot the flags, which should be used in preference to plain reads. Subsequent patches will move existing code to use the new helpers. Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Acked-by: Marco Elver Acked-by: Paul E. McKenney Cc: Boqun Feng Cc: Dmitry Vyukov Cc: Peter Zijlstra Cc: Will Deacon Link: https://lore.kernel.org/r/20211129130653.2037928-2-mark.rutland@arm.com --- include/linux/thread_info.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ad0c4e041030..73a6f34b3847 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -118,6 +118,15 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) return test_bit(flag, (unsigned long *)&ti->flags); } +/* + * This may be used in noinstr code, and needs to be __always_inline to prevent + * inadvertent instrumentation. 
+ */ +static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti) +{ + return READ_ONCE(ti->flags); +} + #define set_thread_flag(flag) \ set_ti_thread_flag(current_thread_info(), flag) #define clear_thread_flag(flag) \ @@ -130,6 +139,11 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) test_and_clear_ti_thread_flag(current_thread_info(), flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) +#define read_thread_flags() \ + read_ti_thread_flags(current_thread_info()) + +#define read_task_thread_flags(t) \ + read_ti_thread_flags(task_thread_info(t)) #ifdef CONFIG_GENERIC_ENTRY #define set_syscall_work(fl) \ -- cgit v1.2.3 From 6ce895128b3bff738fe8d9dd74747a03e319e466 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 29 Nov 2021 13:06:44 +0000 Subject: entry: Snapshot thread flags Some thread flags can be set remotely, and so even when IRQs are disabled, the flags can change under our feet. Generally this is unlikely to cause a problem in practice, but it is somewhat unsound, and KCSAN will legitimately warn that there is a data race. To avoid such issues, a snapshot of the flags has to be taken prior to using them. Some places already use READ_ONCE() for that, others do not. Convert them all to the new flag accessor helpers. Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Acked-by: Paul E. 
McKenney Link: https://lore.kernel.org/r/20211129130653.2037928-3-mark.rutland@arm.com --- include/linux/entry-kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 0d7865a0731c..07c878d6e323 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -75,7 +75,7 @@ static inline void xfer_to_guest_mode_prepare(void) */ static inline bool __xfer_to_guest_mode_work_pending(void) { - unsigned long ti_work = READ_ONCE(current_thread_info()->flags); + unsigned long ti_work = read_thread_flags(); return !!(ti_work & XFER_TO_GUEST_MODE_WORK); } -- cgit v1.2.3 From 4946f15e8c334840bf277a0bf924371eae120fcd Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Tue, 30 Nov 2021 22:40:43 +0100 Subject: genirq/generic_chip: Constify irq_generic_chip_ops The only usage of irq_generic_chip_ops is to pass its address to irq_domain_add_linear() which takes a pointer to const struct irq_domain_ops. Make it const to allow the compiler to put it in read-only memory. 
[ tglx: Fixed subject prefix ] Signed-off-by: Rikard Falkeborn Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20211130214043.1257585-1-rikard.falkeborn@gmail.com --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 553da4899f55..d476405802e9 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -131,7 +131,7 @@ struct irq_domain_ops { #endif }; -extern struct irq_domain_ops irq_generic_chip_ops; +extern const struct irq_domain_ops irq_generic_chip_ops; struct irq_domain_chip_generic; -- cgit v1.2.3 From 24ba53017e188e031f9cb8b290286fad52d2af00 Mon Sep 17 00:00:00 2001 From: Chun-Hung Tseng Date: Wed, 15 Sep 2021 17:02:18 +0800 Subject: rcu: Replace ________p1 and _________p1 with __UNIQUE_ID(rcu) This commit replaces both ________p1 and _________p1 with __UNIQUE_ID(rcu), and also adjusts the callers of the affected macros. __UNIQUE_ID(rcu) will generate unique variable names during compilation, which eliminates the need of ________p1 and _________p1 (both having 4 occurrences prior to the code change). This also avoids the variable name shadowing issue, or at least makes those wishing to cause shadowing problems work much harder to do so. The same idea is used for the min/max macros (commit 589a978 and commit e9092d0). Signed-off-by: Jim Huang Signed-off-by: Chun-Hung Tseng Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 50 +++++++++++++++++++++++++++--------------------- include/linux/srcu.h | 3 ++- 2 files changed, 30 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5e0beb5c5659..88b42eb46406 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -364,6 +364,12 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ +#define __unrcu_pointer(p, local) \ +({ \ + typeof(*p) *local = (typeof(*p) *__force)(p); \ + rcu_check_sparse(p, __rcu); \ + ((typeof(*p) __force __kernel *)(local)); \ +}) /** * unrcu_pointer - mark a pointer as not being RCU protected * @p: pointer needing to lose its __rcu property @@ -371,39 +377,35 @@ static inline void rcu_preempt_sleep_check(void) { } * Converts @p from an __rcu pointer to a __kernel pointer. * This allows an __rcu pointer to be used with xchg() and friends. */ -#define unrcu_pointer(p) \ -({ \ - typeof(*p) *_________p1 = (typeof(*p) *__force)(p); \ - rcu_check_sparse(p, __rcu); \ - ((typeof(*p) __force __kernel *)(_________p1)); \ -}) +#define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu)) -#define __rcu_access_pointer(p, space) \ +#define __rcu_access_pointer(p, local, space) \ ({ \ - typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \ + typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ rcu_check_sparse(p, space); \ - ((typeof(*p) __force __kernel *)(_________p1)); \ + ((typeof(*p) __force __kernel *)(local)); \ }) -#define __rcu_dereference_check(p, c, space) \ +#define __rcu_dereference_check(p, local, c, space) \ ({ \ /* Dependency order vs. p above. 
*/ \ - typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ + typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_check_sparse(p, space); \ - ((typeof(*p) __force __kernel *)(________p1)); \ + ((typeof(*p) __force __kernel *)(local)); \ }) -#define __rcu_dereference_protected(p, c, space) \ +#define __rcu_dereference_protected(p, local, c, space) \ ({ \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) -#define rcu_dereference_raw(p) \ +#define __rcu_dereference_raw(p, local) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(p) ________p1 = READ_ONCE(p); \ - ((typeof(*p) __force __kernel *)(________p1)); \ + typeof(p) local = READ_ONCE(p); \ + ((typeof(*p) __force __kernel *)(local)); \ }) +#define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu)) /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable @@ -490,7 +492,7 @@ do { \ * when tearing down multi-linked structures after a grace period * has elapsed. */ -#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu) +#define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu) /** * rcu_dereference_check() - rcu_dereference with debug checking @@ -526,7 +528,8 @@ do { \ * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ - __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu) + __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ + (c) || rcu_read_lock_held(), __rcu) /** * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking @@ -541,7 +544,8 @@ do { \ * rcu_read_lock() but also rcu_read_lock_bh() into account. 
*/ #define rcu_dereference_bh_check(p, c) \ - __rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu) + __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ + (c) || rcu_read_lock_bh_held(), __rcu) /** * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking @@ -556,7 +560,8 @@ do { \ * only rcu_read_lock() but also rcu_read_lock_sched() into account. */ #define rcu_dereference_sched_check(p, c) \ - __rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \ + __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ + (c) || rcu_read_lock_sched_held(), \ __rcu) /* @@ -566,7 +571,8 @@ do { \ * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ -#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu) +#define rcu_dereference_raw_check(p) \ + __rcu_dereference_check((p), __UNIQUE_ID(rcu), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented @@ -585,7 +591,7 @@ do { \ * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ - __rcu_dereference_protected((p), (c), __rcu) + __rcu_dereference_protected((p), __UNIQUE_ID(rcu), (c), __rcu) /** diff --git a/include/linux/srcu.h b/include/linux/srcu.h index e6011a9975af..01226e4d960a 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -117,7 +117,8 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) * lockdep_is_held() calls. 
*/ #define srcu_dereference_check(p, ssp, c) \ - __rcu_dereference_check((p), (c) || srcu_read_lock_held(ssp), __rcu) + __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ + (c) || srcu_read_lock_held(ssp), __rcu) /** * srcu_dereference - fetch SRCU-protected pointer for later dereferencing -- cgit v1.2.3 From 2407a64f8045552203ee5cb9904ce75ce2fceef4 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 28 Sep 2021 08:21:28 +0800 Subject: rcu: in_irq() cleanup This commit replaces the obsolete and ambiguous macro in_irq() with its shiny new in_hardirq() equivalent. Signed-off-by: Changbin Du Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 9be015305f9f..858f4d429946 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -85,7 +85,7 @@ static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } static inline void rcu_irq_exit_check_preempt(void) { } #define rcu_is_idle_cpu(cpu) \ - (is_idle_task(current) && !in_nmi() && !in_irq() && !in_serving_softirq()) + (is_idle_task(current) && !in_nmi() && !in_hardirq() && !in_serving_softirq()) static inline void exit_rcu(void) { } static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t) { -- cgit v1.2.3 From 502e82b91361955c66c8453b5b7a905b0b5bd5a1 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 7 Nov 2021 17:21:45 +0200 Subject: net/mlx5: Fix access to a non-supported register Validate MRTC register is supported before triggering a delayed work which accesses it. 
Fixes: 5a1023deeed0 ("net/mlx5: Add periodic update of host time to firmware") Signed-off-by: Aya Levin Reviewed-by: Gal Pressman Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3636df90899a..fbaab440a484 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9698,7 +9698,10 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 regs_84_to_68[0x11]; u8 tracer_registers[0x4]; - u8 regs_63_to_32[0x20]; + u8 regs_63_to_46[0x12]; + u8 mrtc[0x1]; + u8 regs_44_to_32[0xd]; + u8 regs_31_to_0[0x20]; }; -- cgit v1.2.3 From af3bf054661fb11497a7f712ece8b838521227a4 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 1 Dec 2021 01:17:36 +0000 Subject: cgroup: fix a typo in comment In commit 8699b7762a62 ("cgroup: s/child_subsys_mask/subtree_ss_mask/"), we renamed child_subsys_mask to subtree_ss_mask, but missed renaming it in this comment. Signed-off-by: Wei Yang Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index db2e147e069f..bb1e79791ed5 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -413,7 +413,7 @@ struct cgroup { /* * The bitmask of subsystems enabled on the child cgroups. * ->subtree_control is the one configured through - * "cgroup.subtree_control" while ->child_ss_mask is the effective + * "cgroup.subtree_control" while ->subtree_ss_mask is the effective * one which may have more subsystems enabled. Controller knobs * are made available iff it's enabled in ->subtree_control.
*/ -- cgit v1.2.3 From 6bbfa44116689469267f1a6e3d233b52114139d2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 1 Dec 2021 23:45:50 +0900 Subject: kprobes: Limit max data_size of the kretprobe instances The 'kretprobe::data_size' is unsigned, thus it cannot be negative. But if the user sets it to a large enough number (e.g. (size_t)-8), the result of 'data_size + sizeof(struct kretprobe_instance)' becomes smaller than sizeof(struct kretprobe_instance) or zero. As a result, the kretprobe_instance is allocated without enough memory, and kretprobe accesses memory outside of the allocation. To avoid this issue, introduce a max limitation of the kretprobe::data_size. 4KB per instance should be OK. Link: https://lkml.kernel.org/r/163836995040.432120.10322772773821182925.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: f47cd9b553aa ("kprobes: kretprobe user entry-handler") Reported-by: zhangyue Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e974caf39d3e..8c8f7a4d93af 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -153,6 +153,8 @@ struct kretprobe { struct kretprobe_holder *rph; }; +#define KRETPROBE_MAX_DATA_SIZE 4096 + struct kretprobe_instance { union { struct freelist_node freelist; -- cgit v1.2.3 From 7a10d8c810cfad3e79372d7d1c77899d86cd6662 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Nov 2021 09:01:55 -0800 Subject: net: annotate data-races on txq->xmit_lock_owner syzbot found that __dev_queue_xmit() is reading txq->xmit_lock_owner without annotations. No serious issue there, let's document what is happening there.
BUG: KCSAN: data-race in __dev_queue_xmit / __dev_queue_xmit write to 0xffff888139d09484 of 4 bytes by interrupt on cpu 0: __netif_tx_unlock include/linux/netdevice.h:4437 [inline] __dev_queue_xmit+0x948/0xf70 net/core/dev.c:4229 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_hh_output include/net/neighbour.h:511 [inline] neigh_output include/net/neighbour.h:525 [inline] ip6_finish_output2+0x995/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x3e/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 read to 0xffff888139d09484 of 4 bytes by interrupt on cpu 1: __dev_queue_xmit+0x5e3/0xf70 net/core/dev.c:4213 
dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_resolve_output+0x3db/0x410 net/core/neighbour.c:1523 neigh_output include/net/neighbour.h:527 [inline] ip6_finish_output2+0x9be/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x8d/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 kcsan_setup_watchpoint+0x94/0x420 kernel/kcsan/core.c:443 folio_test_anon include/linux/page-flags.h:581 [inline] PageAnon include/linux/page-flags.h:586 [inline] zap_pte_range+0x5ac/0x10e0 mm/memory.c:1347 zap_pmd_range mm/memory.c:1467 [inline] zap_pud_range mm/memory.c:1496 [inline] zap_p4d_range mm/memory.c:1517 [inline] unmap_page_range+0x2dc/0x3d0 
mm/memory.c:1538 unmap_single_vma+0x157/0x210 mm/memory.c:1583 unmap_vmas+0xd0/0x180 mm/memory.c:1615 exit_mmap+0x23d/0x470 mm/mmap.c:3170 __mmput+0x27/0x1b0 kernel/fork.c:1113 mmput+0x3d/0x50 kernel/fork.c:1134 exit_mm+0xdb/0x170 kernel/exit.c:507 do_exit+0x608/0x17a0 kernel/exit.c:819 do_group_exit+0xce/0x180 kernel/exit.c:929 get_signal+0xfc3/0x1550 kernel/signal.c:2852 arch_do_signal_or_restart+0x8c/0x2e0 arch/x86/kernel/signal.c:868 handle_signal_work kernel/entry/common.c:148 [inline] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] exit_to_user_mode_prepare+0x113/0x190 kernel/entry/common.c:207 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0xffffffff Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 28712 Comm: syz-executor.0 Tainted: G W 5.16.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20211130170155.2331929-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec42495a43a..be5cb3360b94 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4404,7 +4404,8 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); - txq->xmit_lock_owner = cpu; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, cpu); } static inline bool __netif_tx_acquire(struct netdev_queue *txq) @@ -4421,26 
+4422,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq) static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - txq->xmit_lock_owner = smp_processor_id(); + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } static inline bool __netif_tx_trylock(struct netdev_queue *txq) { bool ok = spin_trylock(&txq->_xmit_lock); - if (likely(ok)) - txq->xmit_lock_owner = smp_processor_id(); + + if (likely(ok)) { + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); + } return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } -- cgit v1.2.3 From 57b2b72ac1fc5d55cf3b13207942c109f1a65cb5 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 29 Nov 2021 09:57:59 -0700 Subject: mm, slab: Remove compiler check in __kmalloc_index The minimum supported version of LLVM has been raised to 11.0.0, meaning this check is always true, so it can be dropped. 
Signed-off-by: Nathan Chancellor Reviewed-by: Miguel Ojeda Reviewed-by: Mark Brown Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Signed-off-by: Masahiro Yamada --- include/linux/slab.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 181045148b06..d3fb5ac71c24 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -411,8 +411,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size, if (size <= 16 * 1024 * 1024) return 24; if (size <= 32 * 1024 * 1024) return 25; - if ((IS_ENABLED(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 110000) - && !IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant) + if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant) BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()"); else BUG(); -- cgit v1.2.3 From e7f2be115f0746b969c0df14c0d182f65f005ca5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Oct 2021 16:10:55 +0200 Subject: sched/cputime: Fix getrusage(RUSAGE_THREAD) with nohz_full getrusage(RUSAGE_THREAD) with nohz_full may return shorter utime/stime than the actual time. task_cputime_adjusted() snapshots utime and stime and then adjust their sum to match the scheduler maintained cputime.sum_exec_runtime. Unfortunately in nohz_full, sum_exec_runtime is only updated once per second in the worst case, causing a discrepancy against utime and stime that can be updated anytime by the reader using vtime. To fix this situation, perform an update of cputime.sum_exec_runtime when the cputime snapshot reports the task as actually running while the tick is disabled. The related overhead is then contained within the relevant situations. 
Reported-by: Hasegawa Hitomi Signed-off-by: Frederic Weisbecker Signed-off-by: Hasegawa Hitomi Signed-off-by: Thomas Gleixner Tested-by: Masayoshi Mizuma Acked-by: Phil Auld Link: https://lore.kernel.org/r/20211026141055.57358-3-frederic@kernel.org --- include/linux/sched/cputime.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 6c9f19a33865..ce3c58286062 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -18,15 +18,16 @@ #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void task_cputime(struct task_struct *t, +extern bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime); extern u64 task_gtime(struct task_struct *t); #else -static inline void task_cputime(struct task_struct *t, +static inline bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { *utime = t->utime; *stime = t->stime; + return false; } static inline u64 task_gtime(struct task_struct *t) -- cgit v1.2.3 From f83baa0cb6cfc92ebaf7f9d3a99d7e34f2e77a8a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 1 Dec 2021 19:35:01 +0100 Subject: HID: add hid_is_usb() function to make it simpler for USB detection A number of HID drivers already call hid_is_using_ll_driver() but only for the detection of if this is a USB device or not. Make this more obvious by creating hid_is_usb() and calling the function that way. Also converts the existing hid_is_using_ll_driver() functions to use the new call. 
Cc: Jiri Kosina Cc: Benjamin Tissoires Cc: linux-input@vger.kernel.org Cc: stable@vger.kernel.org Tested-by: Benjamin Tissoires Signed-off-by: Greg Kroah-Hartman Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20211201183503.2373082-1-gregkh@linuxfoundation.org --- include/linux/hid.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 9e067f937dbc..f453be385bd4 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -840,6 +840,11 @@ static inline bool hid_is_using_ll_driver(struct hid_device *hdev, return hdev->ll_driver == driver; } +static inline bool hid_is_usb(struct hid_device *hdev) +{ + return hid_is_using_ll_driver(hdev, &usb_hid_driver); +} + #define PM_HINT_FULLON 1<<5 #define PM_HINT_NORMAL 1<<1 -- cgit v1.2.3 From 9e3562080950b6e3fe38a5e34ddd5b1c618f2019 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Thu, 2 Dec 2021 10:53:33 +0100 Subject: HID: add suspend/resume helpers There is a lot of duplication of code in the HID low level drivers. Better have everything in one place so we can eventually extend it in a generic way. 
Signed-off-by: Benjamin Tissoires Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20211202095334.14399-4-benjamin.tissoires@redhat.com --- include/linux/hid.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 9e067f937dbc..ebe3ec98db6b 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -923,6 +923,16 @@ s32 hid_snto32(__u32 value, unsigned n); __u32 hid_field_extract(const struct hid_device *hid, __u8 *report, unsigned offset, unsigned n); +#ifdef CONFIG_PM +int hid_driver_suspend(struct hid_device *hdev, pm_message_t state); +int hid_driver_reset_resume(struct hid_device *hdev); +int hid_driver_resume(struct hid_device *hdev); +#else +static inline int hid_driver_suspend(struct hid_device *hdev, pm_message_t state) { return 0; } +static inline int hid_driver_reset_resume(struct hid_device *hdev) { return 0; } +static inline int hid_driver_resume(struct hid_device *hdev) { return 0; } +#endif + /** * hid_device_io_start - enable HID input during probe, remove * -- cgit v1.2.3 From f65a0b1f3e79444bba9ac56435eeb32db85ab2c9 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Thu, 2 Dec 2021 10:53:34 +0100 Subject: HID: do not inline some hid_hw_ functions We don't gain much by having them as inline, and it actually prevents us from attaching a probe to those helpers.
Signed-off-by: Benjamin Tissoires Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20211202095334.14399-5-benjamin.tissoires@redhat.com --- include/linux/hid.h | 68 +++++------------------------------------------------ 1 file changed, 6 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index ebe3ec98db6b..b2fea7fc54a1 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1066,6 +1066,12 @@ int __must_check hid_hw_start(struct hid_device *hdev, void hid_hw_stop(struct hid_device *hdev); int __must_check hid_hw_open(struct hid_device *hdev); void hid_hw_close(struct hid_device *hdev); +void hid_hw_request(struct hid_device *hdev, + struct hid_report *report, int reqtype); +int hid_hw_raw_request(struct hid_device *hdev, + unsigned char reportnum, __u8 *buf, + size_t len, unsigned char rtype, int reqtype); +int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len); /** * hid_hw_power - requests underlying HW to go into given power mode @@ -1083,68 +1089,6 @@ static inline int hid_hw_power(struct hid_device *hdev, int level) } -/** - * hid_hw_request - send report request to device - * - * @hdev: hid device - * @report: report to send - * @reqtype: hid request type - */ -static inline void hid_hw_request(struct hid_device *hdev, - struct hid_report *report, int reqtype) -{ - if (hdev->ll_driver->request) - return hdev->ll_driver->request(hdev, report, reqtype); - - __hid_request(hdev, report, reqtype); -} - -/** - * hid_hw_raw_request - send report request to device - * - * @hdev: hid device - * @reportnum: report ID - * @buf: in/out data to transfer - * @len: length of buf - * @rtype: HID report type - * @reqtype: HID_REQ_GET_REPORT or HID_REQ_SET_REPORT - * - * Return: count of data transferred, negative if error - * - * Same behavior as hid_hw_request, but with raw buffers instead. 
- */ -static inline int hid_hw_raw_request(struct hid_device *hdev, - unsigned char reportnum, __u8 *buf, - size_t len, unsigned char rtype, int reqtype) -{ - if (len < 1 || len > HID_MAX_BUFFER_SIZE || !buf) - return -EINVAL; - - return hdev->ll_driver->raw_request(hdev, reportnum, buf, len, - rtype, reqtype); -} - -/** - * hid_hw_output_report - send output report to device - * - * @hdev: hid device - * @buf: raw data to transfer - * @len: length of buf - * - * Return: count of data transferred, negative if error - */ -static inline int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, - size_t len) -{ - if (len < 1 || len > HID_MAX_BUFFER_SIZE || !buf) - return -EINVAL; - - if (hdev->ll_driver->output_report) - return hdev->ll_driver->output_report(hdev, buf, len); - - return -ENOSYS; -} - /** * hid_hw_idle - send idle request to device * -- cgit v1.2.3 From 8293eb995f349aed28006792cad4cb48091919dd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:25 -0800 Subject: bpf: Rename btf_member accessors. Rename btf_member_bit_offset() and btf_member_bitfield_size() to avoid conflicts with similarly named helpers in libbpf's btf.h. Rename the kernel helpers, since libbpf helpers are part of uapi. 
Suggested-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-3-alexei.starovoitov@gmail.com --- include/linux/btf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 203eef993d76..956f70388f69 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -194,15 +194,15 @@ static inline bool btf_type_kflag(const struct btf_type *t) return BTF_INFO_KFLAG(t->info); } -static inline u32 btf_member_bit_offset(const struct btf_type *struct_type, - const struct btf_member *member) +static inline u32 __btf_member_bit_offset(const struct btf_type *struct_type, + const struct btf_member *member) { return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) : member->offset; } -static inline u32 btf_member_bitfield_size(const struct btf_type *struct_type, - const struct btf_member *member) +static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type, + const struct btf_member *member) { return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) : 0; -- cgit v1.2.3 From 29db4bea1d10b73749d7992c1fc9ac13499e8871 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:26 -0800 Subject: bpf: Prepare relo_core.c for kernel duty. Make relo_core.c compilable for both the kernel and user space libbpf. Note the patch is reducing BPF_CORE_SPEC_MAX_LEN from 64 to 32. This is the maximum number of nested structs and arrays. For example: struct sample { int a; struct { int b[10]; }; }; struct sample *s = ...; int *y = &s->b[5]; This field access is encoded as "0:1:0:5" and spec len is 4. A follow-up patch might bump it back to 64.
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-4-alexei.starovoitov@gmail.com --- include/linux/btf.h | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 956f70388f69..acef6ef28768 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -144,6 +144,53 @@ static inline bool btf_type_is_enum(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; } +static inline bool str_is_empty(const char *s) +{ + return !s || !s[0]; +} + +static inline u16 btf_kind(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info); +} + +static inline bool btf_is_enum(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM; +} + +static inline bool btf_is_composite(const struct btf_type *t) +{ + u16 kind = btf_kind(t); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static inline bool btf_is_array(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ARRAY; +} + +static inline bool btf_is_int(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_INT; +} + +static inline bool btf_is_ptr(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_PTR; +} + +static inline u8 btf_int_offset(const struct btf_type *t) +{ + return BTF_INT_OFFSET(*(u32 *)(t + 1)); +} + +static inline u8 btf_int_encoding(const struct btf_type *t) +{ + return BTF_INT_ENCODING(*(u32 *)(t + 1)); +} + static inline bool btf_type_is_scalar(const struct btf_type *t) { return btf_type_is_int(t) || btf_type_is_enum(t); @@ -184,6 +231,11 @@ static inline u16 btf_type_vlen(const struct btf_type *t) return BTF_INFO_VLEN(t->info); } +static inline u16 btf_vlen(const struct btf_type *t) +{ + return btf_type_vlen(t); +} + static inline u16 btf_func_linkage(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); @@ -208,11 
+260,40 @@ static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type, : 0; } +static inline struct btf_member *btf_members(const struct btf_type *t) +{ + return (struct btf_member *)(t + 1); +} + +static inline u32 btf_member_bit_offset(const struct btf_type *t, u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + + return __btf_member_bit_offset(t, m); +} + +static inline u32 btf_member_bitfield_size(const struct btf_type *t, u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + + return __btf_member_bitfield_size(t, m); +} + static inline const struct btf_member *btf_type_member(const struct btf_type *t) { return (const struct btf_member *)(t + 1); } +static inline struct btf_array *btf_array(const struct btf_type *t) +{ + return (struct btf_array *)(t + 1); +} + +static inline struct btf_enum *btf_enum(const struct btf_type *t) +{ + return (struct btf_enum *)(t + 1); +} + static inline const struct btf_var_secinfo *btf_type_var_secinfo( const struct btf_type *t) { -- cgit v1.2.3 From fbd94c7afcf99c9f3b1ba1168657ecc428eb2c8d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:28 -0800 Subject: bpf: Pass a set of bpf_core_relo-s to prog_load command. struct bpf_core_relo is generated by llvm and processed by libbpf. It's a de-facto uapi. With CO-RE in the kernel the struct bpf_core_relo becomes uapi de-jure. Add an ability to pass a set of 'struct bpf_core_relo' to prog_load command and let the kernel perform CO-RE relocations. Note the struct bpf_line_info and struct bpf_func_info have the same layout when passed from LLVM to libbpf and from libbpf to the kernel except "insn_off" fields means "byte offset" when LLVM generates it. Then libbpf converts it to "insn index" to pass to the kernel. The struct bpf_core_relo's "insn_off" field is always "byte offset". 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-6-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cad0829710be..8bbf08fbab66 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1732,6 +1732,14 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); +struct bpf_core_ctx { + struct bpf_verifier_log *log; + const struct btf *btf; +}; + +int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, + int relo_idx, void *insn); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { -- cgit v1.2.3 From d9847eb8be3d895b2b5f514fdf3885d47a0b92a2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 22 Nov 2021 20:17:40 +0530 Subject: bpf: Make CONFIG_DEBUG_INFO_BTF depend upon CONFIG_BPF_SYSCALL Vinicius Costa Gomes reported [0] that build fails when CONFIG_DEBUG_INFO_BTF is enabled and CONFIG_BPF_SYSCALL is disabled. This leads to btf.c not being compiled, and then no symbol being present in vmlinux for the declarations in btf.h. Since BTF is not useful without enabling BPF subsystem, disallow this combination. However, theoretically disabling both now could still fail, as the symbol for kfunc_btf_id_list variables is not available. This isn't a problem as the compiler usually optimizes the whole register/unregister call, but at lower optimization levels it can fail the build in linking stage. Fix that by adding dummy variables so that modules taking address of them still work, but the whole thing is a noop. 
[0]: https://lore.kernel.org/bpf/20211110205418.332403-1-vinicius.gomes@intel.com Fixes: 14f267d95fe4 ("bpf: btf: Introduce helpers for dynamic BTF set registration") Reported-by: Vinicius Costa Gomes Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211122144742.477787-2-memxor@gmail.com --- include/linux/btf.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 203eef993d76..0e1b6281fd8f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -245,7 +245,10 @@ struct kfunc_btf_id_set { struct module *owner; }; -struct kfunc_btf_id_list; +struct kfunc_btf_id_list { + struct list_head list; + struct mutex mutex; +}; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, @@ -254,6 +257,9 @@ void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, struct kfunc_btf_id_set *s); bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, struct module *owner); + +extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; +extern struct kfunc_btf_id_list prog_test_kfunc_list; #else static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, struct kfunc_btf_id_set *s) @@ -268,13 +274,13 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, { return false; } + +static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused; +static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused; #endif #define DEFINE_KFUNC_BTF_ID_SET(set, name) \ struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set), \ THIS_MODULE } -extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; -extern struct kfunc_btf_id_list prog_test_kfunc_list; - #endif -- cgit v1.2.3 From b247f32aecad09e6cf7edff7739e6f2c9dc5fca9 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Thu, 28 Oct 2021 16:03:06 +0300 
Subject: net/mlx5: Dynamically resize flow counters query buffer The flow counters bulk query buffer is allocated once during mlx5_fc_init_stats(). For PFs and VFs this buffer usually takes a little more than 512KB of memory, which is aligned to the next power of 2, to 1MB. For SFs, this buffer is reduced and takes around 128 Bytes. The buffer size determines the maximum number of flow counters that can be queried at a time. Thus, having a bigger buffer can improve performance for users that need to query many flow counters. There are cases that don't use many flow counters and don't need a big buffer (e.g. SFs, VFs). Since this size is critical with large scale, in these cases the buffer size should be reduced. In order to reduce memory consumption while maintaining query performance, change the query buffer's allocation scheme to the following: - First allocate the buffer with a small initial size. - If the number of counters surpasses the initial size, resize the buffer to the maximum size. The buffer only grows and is never shrunk, because users with many flow counters don't care about the buffer size and we don't want to add resize overhead if the current number of counters drops. This solution is preferable to the current one, which is less accurate and only addresses SFs.
Signed-off-by: Avihai Horon Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a623ec635947..78655d8d13a7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -478,6 +478,10 @@ struct mlx5_fc_stats { unsigned long next_query; unsigned long sampling_interval; /* jiffies */ u32 *bulk_query_out; + int bulk_query_len; + size_t num_counters; + bool bulk_query_alloc_failed; + unsigned long next_bulk_query_alloc; struct mlx5_fc_pool fc_pool; }; -- cgit v1.2.3 From e2748ad5257754a47376e28c0f9dda4f5c1e5ca3 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 2 Nov 2021 16:02:00 -0600 Subject: mtd: remove unused header file Commit d24dbd7541ff ("mtd: maps: Get rid of the latch-addr-flash driver") removed the last user of but left the header file behind. Nothing uses this file, delete it now. Cc: Boris Brezillon Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Cc: linux-mtd@lists.infradead.org Signed-off-by: Jonathan Corbet Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20211102220203.940290-7-corbet@lwn.net --- include/linux/mtd/latch-addr-flash.h | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 include/linux/mtd/latch-addr-flash.h (limited to 'include/linux') diff --git a/include/linux/mtd/latch-addr-flash.h b/include/linux/mtd/latch-addr-flash.h deleted file mode 100644 index e94b8e128074..000000000000 --- a/include/linux/mtd/latch-addr-flash.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Interface for NOR flash driver whose high address lines are latched - * - * Copyright © 2008 MontaVista Software, Inc. - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. 
- */ -#ifndef __LATCH_ADDR_FLASH__ -#define __LATCH_ADDR_FLASH__ - -struct map_info; -struct mtd_partition; - -struct latch_addr_flash_data { - unsigned int width; - unsigned int size; - - int (*init)(void *data, int cs); - void (*done)(void *data); - void (*set_window)(unsigned long offset, void *data); - void *data; - - unsigned int nr_parts; - struct mtd_partition *parts; -}; - -#endif -- cgit v1.2.3 From 2c4dcd7fd57b20a21b65da04d89c38a7217d79cf Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 29 Nov 2021 14:03:07 +0100 Subject: topology/sysfs: export die attributes only if an architectures has support The die_id and die_cpus topology sysfs attributes have been added with commit 0e344d8c709f ("cpu/topology: Export die_id") and commit 2e4c54dac7b3 ("topology: Create core_cpus and die_cpus sysfs attributes"). While they are currently only used and useful for x86 they are still present with bogus default values for all architectures. Instead of enforcing such new sysfs attributes to all architectures, make them only optional visible if an architecture opts in by defining both the topology_die_id and topology_die_cpumask attributes. This is similar to what was done when the book and drawer topology levels were introduced: avoid useless and therefore confusing sysfs attributes for architectures which cannot make use of them. This should not break any existing applications, since this is a rather new interface and applications should be able to handle also older kernel versions without such attributes - besides that they contain only useful information for x86. 
Acked-by: Peter Zijlstra (Intel) Signed-off-by: Heiko Carstens Link: https://lore.kernel.org/r/20211129130309.3256168-2-hca@linux.ibm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/topology.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 0b3704ad13c8..8d1bdae76230 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -180,6 +180,10 @@ static inline int cpu_to_mem(int cpu) #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ +#if defined(topology_die_id) && defined(topology_die_cpumask) +#define TOPOLOGY_DIE_SYSFS +#endif + #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) #endif -- cgit v1.2.3 From e795707703b32fecdd7467afcc33ff1e92416c05 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 29 Nov 2021 14:03:08 +0100 Subject: topology/sysfs: export cluster attributes only if an architectures has support The cluster_id and cluster_cpus topology sysfs attributes have been added with commit c5e22feffdd7 ("topology: Represent clusters of CPUs within a die"). They are currently only used for x86, arm64, and riscv (via generic arch topology), however they are still present with bogus default values for all other architectures. Instead of enforcing such new sysfs attributes to all architectures, make them only optional visible if an architecture opts in by defining both the topology_cluster_id and topology_cluster_cpumask attributes. This is similar to what was done when the book and drawer topology levels were introduced: avoid useless and therefore confusing sysfs attributes for architectures which cannot make use of them. This should not break any existing applications, since this is a new interface introduced with the v5.16 merge window. 
Acked-by: Peter Zijlstra (Intel) Signed-off-by: Heiko Carstens Link: https://lore.kernel.org/r/20211129130309.3256168-3-hca@linux.ibm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/topology.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 8d1bdae76230..d52be69037db 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -183,6 +183,9 @@ static inline int cpu_to_mem(int cpu) #if defined(topology_die_id) && defined(topology_die_cpumask) #define TOPOLOGY_DIE_SYSFS #endif +#if defined(topology_cluster_id) && defined(topology_cluster_cpumask) +#define TOPOLOGY_CLUSTER_SYSFS +#endif #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) -- cgit v1.2.3 From f1045056c726440469d89d23c13734bcd6c0d15b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 29 Nov 2021 14:03:09 +0100 Subject: topology/sysfs: rework book and drawer topology ifdefery Provide default defines for the topology_book_[id|cpumask] and topology_drawer_[id|cpumask] macros just like for each other topology level. This way all topology levels are handled in a similar way. Still, the book and drawer levels are only used on s390, and also the sysfs attributes are only created on s390. However other architectures may opt in if wanted.
Acked-by: Peter Zijlstra (Intel) Signed-off-by: Heiko Carstens Link: https://lore.kernel.org/r/20211129130309.3256168-4-hca@linux.ibm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/topology.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index d52be69037db..a6e201758ae9 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -186,6 +186,12 @@ static inline int cpu_to_mem(int cpu) #if defined(topology_cluster_id) && defined(topology_cluster_cpumask) #define TOPOLOGY_CLUSTER_SYSFS #endif +#if defined(topology_book_id) && defined(topology_book_cpumask) +#define TOPOLOGY_BOOK_SYSFS +#endif +#if defined(topology_drawer_id) && defined(topology_drawer_cpumask) +#define TOPOLOGY_DRAWER_SYSFS +#endif #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) @@ -199,6 +205,12 @@ static inline int cpu_to_mem(int cpu) #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif +#ifndef topology_book_id +#define topology_book_id(cpu) ((void)(cpu), -1) +#endif +#ifndef topology_drawer_id +#define topology_drawer_id(cpu) ((void)(cpu), -1) +#endif #ifndef topology_sibling_cpumask #define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif @@ -211,6 +223,12 @@ static inline int cpu_to_mem(int cpu) #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif +#ifndef topology_book_cpumask +#define topology_book_cpumask(cpu) cpumask_of(cpu) +#endif +#ifndef topology_drawer_cpumask +#define topology_drawer_cpumask(cpu) cpumask_of(cpu) +#endif #if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) -- cgit v1.2.3 From c7fdb2404f66131bc9c22e06f712717288826487 Mon Sep 17 00:00:00 2001 From: Abhyuday Godhasara Date: Sun, 28 Nov 2021 23:02:14 -0800 Subject: drivers: soc: xilinx: add xilinx event management driver Xilinx 
event management driver provides an interface to subscribe or unsubscribe for the event/callback supported by firmware. An agent can use this driver to register for Error Event, Device Event and Suspend callback. This driver only allows one agent per event to do registration. Driver will return an error in case of multiple registration for the same event. This driver gets notification from firmware through TF-A as SGI. During initialization, the event manager driver registers a handler for the SGI used for notification. It also provides SGI number info to TF-A by using IOCTL_REGISTER_SGI call to TF-A. After receiving notification from firmware, the driver makes an SMC call to TF-A to get IPI data. From the IPI data provided by TF-A, the event manager identifies the cause of the event and forwards that event/callback notification to the respective subscribed driver. After this, in case of Error Event, the driver performs unregistration because the firmware expects the agent to re-register if the agent wants to get notified on the second occurrence of an error event. Add new IOCTL id IOCTL_REGISTER_SGI = 25 which is used to register SGI on TF-A. Older firmware doesn't have all required support for event handling which is required by the event manager driver. So add check for the register notifier version in the event manager driver. Xilinx event management driver provides support to subscribe for multiple error events with the use of Event Mask in a single call of xlnx_register_event(). Agent driver can provide 'Event' parameter value as ORed of multiple event masks to register single callback for multiple events. For example, to register callback for event=0x1 and event=0x2 for the given node, agent can provide event=0x3 (0x1 | 0x2). It is not possible to register multiple events for different nodes in a single registration call.
Also provide support to receive multiple error events as in single notification from firmware and then forward it to subscribed drivers via registered callback one by one. Acked-by: Michal Simek Signed-off-by: Tejas Patel Signed-off-by: Rajan Vaja Signed-off-by: Abhyuday Godhasara Link: https://lore.kernel.org/r/20211129070216.30253-2-abhyuday.godhasara@xilinx.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-event-manager.h | 36 +++++++++++++++++++++++++++++ include/linux/firmware/xlnx-zynqmp.h | 2 ++ 2 files changed, 38 insertions(+) create mode 100644 include/linux/firmware/xlnx-event-manager.h (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-event-manager.h b/include/linux/firmware/xlnx-event-manager.h new file mode 100644 index 000000000000..3f87c4929d21 --- /dev/null +++ b/include/linux/firmware/xlnx-event-manager.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _FIRMWARE_XLNX_EVENT_MANAGER_H_ +#define _FIRMWARE_XLNX_EVENT_MANAGER_H_ + +#include + +#define CB_MAX_PAYLOAD_SIZE (4U) /*In payload maximum 32bytes */ + +/************************** Exported Function *****************************/ + +typedef void (*event_cb_func_t)(const u32 *payload, void *data); + +#if IS_REACHABLE(CONFIG_XLNX_EVENT_MANAGER) +int xlnx_register_event(const enum pm_api_cb_id cb_type, const u32 node_id, + const u32 event, const bool wake, + event_cb_func_t cb_fun, void *data); + +int xlnx_unregister_event(const enum pm_api_cb_id cb_type, const u32 node_id, + const u32 event, event_cb_func_t cb_fun); +#else +static inline int xlnx_register_event(const enum pm_api_cb_id cb_type, const u32 node_id, + const u32 event, const bool wake, + event_cb_func_t cb_fun, void *data) +{ + return -ENODEV; +} + +static inline int xlnx_unregister_event(const enum pm_api_cb_id cb_type, const u32 node_id, + const u32 event, event_cb_func_t cb_fun) +{ + return -ENODEV; +} +#endif + +#endif /* _FIRMWARE_XLNX_EVENT_MANAGER_H_ */ diff --git 
a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 077e894bb340..907cb01890cf 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -141,6 +141,8 @@ enum pm_ioctl_id { /* Set healthy bit value */ IOCTL_SET_BOOT_HEALTH_STATUS = 17, IOCTL_OSPI_MUX_SELECT = 21, + /* Register SGI to ATF */ + IOCTL_REGISTER_SGI = 25, }; enum pm_query_id { -- cgit v1.2.3 From 14866a7db8da1f61fb6135c461b733694eea9580 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 1 Dec 2021 20:43:03 -0800 Subject: Documentation/auxiliary_bus: Add example code for module_auxiliary_driver() Add an example code snippet to the module_auxiliary_driver() documentation which is consistent with the other example code elsewhere in the documentation. Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20211202044305.4006853-6-ira.weiny@intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/auxiliary_bus.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index fc51d45f106b..605b27aab693 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -66,6 +66,10 @@ void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv); * Helper macro for auxiliary drivers which do not do anything special in * module init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() + * + * ..
code-block:: c + * + * module_auxiliary_driver(my_drv); */ #define module_auxiliary_driver(__auxiliary_driver) \ module_driver(__auxiliary_driver, auxiliary_driver_register, auxiliary_driver_unregister) -- cgit v1.2.3 From e1b5186810cc7d4ec60447032636b8e6772dbbc6 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 1 Dec 2021 20:43:05 -0800 Subject: Documentation/auxiliary_bus: Move the text into the code The code and documentation are more difficult to maintain when kept separately. This is further compounded when the standard structure documentation infrastructure is not used. Move the documentation into the code, use the standard documentation infrastructure, add current documented functions, and reference the text in the rst file. Suggested-by: Greg Kroah-Hartman Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20211202044305.4006853-8-ira.weiny@intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/auxiliary_bus.h | 160 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) (limited to 'include/linux') diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 605b27aab693..e6d8b5c16226 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -11,12 +11,172 @@ #include #include +/** + * DOC: DEVICE_LIFESPAN + * + * The registering driver is the entity that allocates memory for the + * auxiliary_device and registers it on the auxiliary bus. It is important to + * note that, as opposed to the platform bus, the registering driver is wholly + * responsible for the management of the memory used for the device object. + * + * To be clear the memory for the auxiliary_device is freed in the release() + * callback defined by the registering driver. The registering driver should + * only call auxiliary_device_delete() and then auxiliary_device_uninit() when + * it is done with the device. 
The release() function is then automatically + * called if and when other code releases their reference to the devices. + * + * A parent object, defined in the shared header file, contains the + * auxiliary_device. It also contains a pointer to the shared object(s), which + * also is defined in the shared header. Both the parent object and the shared + * object(s) are allocated by the registering driver. This layout allows the + * auxiliary_driver's registering module to perform a container_of() call to go + * from the pointer to the auxiliary_device, that is passed during the call to + * the auxiliary_driver's probe function, up to the parent object, and then + * have access to the shared object(s). + * + * The memory for the shared object(s) must have a lifespan equal to, or + * greater than, the lifespan of the memory for the auxiliary_device. The + * auxiliary_driver should only consider that the shared object is valid as + * long as the auxiliary_device is still registered on the auxiliary bus. It + * is up to the registering driver to manage (e.g. free or keep available) the + * memory for the shared object beyond the life of the auxiliary_device. + * + * The registering driver must unregister all auxiliary devices before its own + * driver.remove() is completed. An easy way to ensure this is to use the + * devm_add_action_or_reset() call to register a function against the parent + * device which unregisters the auxiliary device object(s). + * + * Finally, any operations which operate on the auxiliary devices must continue + * to function (if only to return an error) after the registering driver + * unregisters the auxiliary device. + */ + +/** + * struct auxiliary_device - auxiliary device object. 
+ * @dev: Device, + * The release and parent fields of the device structure must be filled + * in + * @name: Match name found by the auxiliary device driver, + * @id: unique identitier if multiple devices of the same name are exported, + * + * An auxiliary_device represents a part of its parent device's functionality. + * It is given a name that, combined with the registering drivers + * KBUILD_MODNAME, creates a match_name that is used for driver binding, and an + * id that combined with the match_name provide a unique name to register with + * the bus subsystem. For example, a driver registering an auxiliary device is + * named 'foo_mod.ko' and the subdevice is named 'foo_dev'. The match name is + * therefore 'foo_mod.foo_dev'. + * + * Registering an auxiliary_device is a three-step process. + * + * First, a 'struct auxiliary_device' needs to be defined or allocated for each + * sub-device desired. The name, id, dev.release, and dev.parent fields of + * this structure must be filled in as follows. + * + * The 'name' field is to be given a name that is recognized by the auxiliary + * driver. If two auxiliary_devices with the same match_name, eg + * "foo_mod.foo_dev", are registered onto the bus, they must have unique id + * values (e.g. "x" and "y") so that the registered devices names are + * "foo_mod.foo_dev.x" and "foo_mod.foo_dev.y". If match_name + id are not + * unique, then the device_add fails and generates an error message. + * + * The auxiliary_device.dev.type.release or auxiliary_device.dev.release must + * be populated with a non-NULL pointer to successfully register the + * auxiliary_device. This release call is where resources associated with the + * auxiliary device must be free'ed. Because once the device is placed on the + * bus the parent driver can not tell what other code may have a reference to + * this data. + * + * The auxiliary_device.dev.parent should be set. Typically to the registering + * drivers device. 
+ * + * Second, call auxiliary_device_init(), which checks several aspects of the + * auxiliary_device struct and performs a device_initialize(). After this step + * completes, any error state must have a call to auxiliary_device_uninit() in + * its resolution path. + * + * The third and final step in registering an auxiliary_device is to perform a + * call to auxiliary_device_add(), which sets the name of the device and adds + * the device to the bus. + * + * .. code-block:: c + * + * #define MY_DEVICE_NAME "foo_dev" + * + * ... + * + * struct auxiliary_device *my_aux_dev = my_aux_dev_alloc(xxx); + * + * // Step 1: + * my_aux_dev->name = MY_DEVICE_NAME; + * my_aux_dev->id = my_unique_id_alloc(xxx); + * my_aux_dev->dev.release = my_aux_dev_release; + * my_aux_dev->dev.parent = my_dev; + * + * // Step 2: + * if (auxiliary_device_init(my_aux_dev)) + * goto fail; + * + * // Step 3: + * if (auxiliary_device_add(my_aux_dev)) { + * auxiliary_device_uninit(my_aux_dev); + * goto fail; + * } + * + * ... + * + * + * Unregistering an auxiliary_device is a two-step process to mirror the + * register process. First call auxiliary_device_delete(), then call + * auxiliary_device_uninit(). + * + * .. code-block:: c + * + * auxiliary_device_delete(my_dev->my_aux_dev); + * auxiliary_device_uninit(my_dev->my_aux_dev); + */ struct auxiliary_device { struct device dev; const char *name; u32 id; }; +/** + * struct auxiliary_driver - Definition of an auxiliary bus driver + * @probe: Called when a matching device is added to the bus. + * @remove: Called when device is removed from the bus. + * @shutdown: Called at shut-down time to quiesce the device. + * @suspend: Called to put the device to sleep mode. Usually to a power state. + * @resume: Called to bring a device from sleep mode. + * @name: Driver name. + * @driver: Core driver structure. + * @id_table: Table of devices this driver should match on the bus. 
+ * + * Auxiliary drivers follow the standard driver model convention, where + * discovery/enumeration is handled by the core, and drivers provide probe() + * and remove() methods. They support power management and shutdown + * notifications using the standard conventions. + * + * Auxiliary drivers register themselves with the bus by calling + * auxiliary_driver_register(). The id_table contains the match_names of + * auxiliary devices that a driver can bind with. + * + * .. code-block:: c + * + * static const struct auxiliary_device_id my_auxiliary_id_table[] = { + * { .name = "foo_mod.foo_dev" }, + * {}, + * }; + * + * MODULE_DEVICE_TABLE(auxiliary, my_auxiliary_id_table); + * + * struct auxiliary_driver my_drv = { + * .name = "myauxiliarydrv", + * .id_table = my_auxiliary_id_table, + * .probe = my_drv_probe, + * .remove = my_drv_remove + * }; + */ struct auxiliary_driver { int (*probe)(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id); void (*remove)(struct auxiliary_device *auxdev); -- cgit v1.2.3 From bb49e9e730c2906a958eee273a7819f401543d6c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 3 Dec 2021 12:16:58 +0100 Subject: fs: add is_idmapped_mnt() helper Multiple places open-code the same check to determine whether a given mount is idmapped. Introduce a simple helper function that can be used instead. This allows us to get rid of the fragile open-coding. We will later change the check that is used to determine whether a given mount is idmapped. Introducing a helper allows us to do this in a single place instead of doing it for multiple places. 
Link: https://lore.kernel.org/r/20211123114227.3124056-2-brauner@kernel.org (v1) Link: https://lore.kernel.org/r/20211130121032.3753852-2-brauner@kernel.org (v2) Link: https://lore.kernel.org/r/20211203111707.3901969-2-brauner@kernel.org Cc: Seth Forshee Cc: Christoph Hellwig Cc: Al Viro CC: linux-fsdevel@vger.kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Seth Forshee Signed-off-by: Christian Brauner --- include/linux/fs.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbf812ce89a8..06cbefd76de7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2724,6 +2724,20 @@ static inline struct user_namespace *file_mnt_user_ns(struct file *file) { return mnt_user_ns(file->f_path.mnt); } + +/** + * is_idmapped_mnt - check whether a mount is mapped + * @mnt: the mount to check + * + * If @mnt has an idmapping attached to it @mnt is mapped. + * + * Return: true if mount is mapped, false if not. + */ +static inline bool is_idmapped_mnt(const struct vfsmount *mnt) +{ + return mnt_user_ns(mnt) != &init_user_ns; +} + extern long vfs_truncate(const struct path *, loff_t); int do_truncate(struct user_namespace *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); -- cgit v1.2.3 From a793d79ea3e041081cd7cbd8ee43d0b5e4914a2b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 3 Dec 2021 12:16:59 +0100 Subject: fs: move mapping helpers The low-level mapping helpers were so far crammed into fs.h. They are out of place there. The fs.h header should just contain the higher-level mapping helpers that interact directly with vfs objects such as struct super_block or struct inode and not the bare mapping helpers. Similarly, only vfs and specific fs code shall interact with low-level mapping helpers. And so they won't be made accessible automatically through regular {g,u}id helpers. 
Link: https://lore.kernel.org/r/20211123114227.3124056-3-brauner@kernel.org (v1) Link: https://lore.kernel.org/r/20211130121032.3753852-3-brauner@kernel.org (v2) Link: https://lore.kernel.org/r/20211203111707.3901969-3-brauner@kernel.org Cc: Seth Forshee Cc: Christoph Hellwig Cc: Al Viro CC: linux-fsdevel@vger.kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Seth Forshee Signed-off-by: Christian Brauner --- include/linux/fs.h | 91 +------------------------------------ include/linux/mnt_idmapping.h | 101 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 90 deletions(-) create mode 100644 include/linux/mnt_idmapping.h (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 06cbefd76de7..b3bcb2129699 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -1624,34 +1625,6 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); } -/** - * kuid_into_mnt - map a kuid down into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kuid: kuid to be mapped - * - * Return: @kuid mapped according to @mnt_userns. - * If @kuid has no mapping INVALID_UID is returned. - */ -static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns, - kuid_t kuid) -{ - return make_kuid(mnt_userns, __kuid_val(kuid)); -} - -/** - * kgid_into_mnt - map a kgid down into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kgid: kgid to be mapped - * - * Return: @kgid mapped according to @mnt_userns. - * If @kgid has no mapping INVALID_GID is returned. 
- */ -static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns, - kgid_t kgid) -{ - return make_kgid(mnt_userns, __kgid_val(kgid)); -} - /** * i_uid_into_mnt - map an inode's i_uid down into a mnt_userns * @mnt_userns: user namespace of the mount the inode was found from @@ -1680,68 +1653,6 @@ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns, return kgid_into_mnt(mnt_userns, inode->i_gid); } -/** - * kuid_from_mnt - map a kuid up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kuid: kuid to be mapped - * - * Return: @kuid mapped up according to @mnt_userns. - * If @kuid has no mapping INVALID_UID is returned. - */ -static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns, - kuid_t kuid) -{ - return KUIDT_INIT(from_kuid(mnt_userns, kuid)); -} - -/** - * kgid_from_mnt - map a kgid up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kgid: kgid to be mapped - * - * Return: @kgid mapped up according to @mnt_userns. - * If @kgid has no mapping INVALID_GID is returned. - */ -static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns, - kgid_t kgid) -{ - return KGIDT_INIT(from_kgid(mnt_userns, kgid)); -} - -/** - * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * - * Use this helper to initialize a new vfs or filesystem object based on - * the caller's fsuid. A common example is initializing the i_uid field of - * a newly allocated inode triggered by a creation event such as mkdir or - * O_CREAT. Other examples include the allocation of quotas for a specific - * user. - * - * Return: the caller's current fsuid mapped up according to @mnt_userns. 
- */ -static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns) -{ - return kuid_from_mnt(mnt_userns, current_fsuid()); -} - -/** - * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * - * Use this helper to initialize a new vfs or filesystem object based on - * the caller's fsgid. A common example is initializing the i_gid field of - * a newly allocated inode triggered by a creation event such as mkdir or - * O_CREAT. Other examples include the allocation of quotas for a specific - * user. - * - * Return: the caller's current fsgid mapped up according to @mnt_userns. - */ -static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns) -{ - return kgid_from_mnt(mnt_userns, current_fsgid()); -} - /** * inode_fsuid_set - initialize inode's i_uid field with callers fsuid * @inode: inode to initialize diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h new file mode 100644 index 000000000000..47c7811fadfe --- /dev/null +++ b/include/linux/mnt_idmapping.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MNT_IDMAPPING_H +#define _LINUX_MNT_IDMAPPING_H + +#include +#include + +struct user_namespace; +extern struct user_namespace init_user_ns; + +/** + * kuid_into_mnt - map a kuid down into a mnt_userns + * @mnt_userns: user namespace of the relevant mount + * @kuid: kuid to be mapped + * + * Return: @kuid mapped according to @mnt_userns. + * If @kuid has no mapping INVALID_UID is returned. + */ +static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns, + kuid_t kuid) +{ + return make_kuid(mnt_userns, __kuid_val(kuid)); +} + +/** + * kgid_into_mnt - map a kgid down into a mnt_userns + * @mnt_userns: user namespace of the relevant mount + * @kgid: kgid to be mapped + * + * Return: @kgid mapped according to @mnt_userns. + * If @kgid has no mapping INVALID_GID is returned. 
+ */ +static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns, + kgid_t kgid) +{ + return make_kgid(mnt_userns, __kgid_val(kgid)); +} + +/** + * kuid_from_mnt - map a kuid up into a mnt_userns + * @mnt_userns: user namespace of the relevant mount + * @kuid: kuid to be mapped + * + * Return: @kuid mapped up according to @mnt_userns. + * If @kuid has no mapping INVALID_UID is returned. + */ +static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns, + kuid_t kuid) +{ + return KUIDT_INIT(from_kuid(mnt_userns, kuid)); +} + +/** + * kgid_from_mnt - map a kgid up into a mnt_userns + * @mnt_userns: user namespace of the relevant mount + * @kgid: kgid to be mapped + * + * Return: @kgid mapped up according to @mnt_userns. + * If @kgid has no mapping INVALID_GID is returned. + */ +static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns, + kgid_t kgid) +{ + return KGIDT_INIT(from_kgid(mnt_userns, kgid)); +} + +/** + * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns + * @mnt_userns: user namespace of the relevant mount + * + * Use this helper to initialize a new vfs or filesystem object based on + * the caller's fsuid. A common example is initializing the i_uid field of + * a newly allocated inode triggered by a creation event such as mkdir or + * O_CREAT. Other examples include the allocation of quotas for a specific + * user. + * + * Return: the caller's current fsuid mapped up according to @mnt_userns. + */ +static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns) +{ + return kuid_from_mnt(mnt_userns, current_fsuid()); +} + +/** + * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns + * @mnt_userns: user namespace of the relevant mount + * + * Use this helper to initialize a new vfs or filesystem object based on + * the caller's fsgid. A common example is initializing the i_gid field of + * a newly allocated inode triggered by a creation event such as mkdir or + * O_CREAT. 
Other examples include the allocation of quotas for a specific + * user. + * + * Return: the caller's current fsgid mapped up according to @mnt_userns. + */ +static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns) +{ + return kgid_from_mnt(mnt_userns, current_fsgid()); +} + +#endif /* _LINUX_MNT_IDMAPPING_H */ -- cgit v1.2.3 From 476860b3eb4a50958243158861d5340066df5af2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 3 Dec 2021 12:17:00 +0100 Subject: fs: tweak fsuidgid_has_mapping() If the caller's fs{g,u}id aren't mapped in the mount's idmapping we can return early and skip the check whether the mapped fs{g,u}id also have a mapping in the filesystem's idmapping. If the fs{g,u}id aren't mapped in the mount's idmapping they consequently can't be mapped in the filesystem's idmapping. So there's no point in checking that. Link: https://lore.kernel.org/r/20211123114227.3124056-4-brauner@kernel.org (v1) Link: https://lore.kernel.org/r/20211130121032.3753852-4-brauner@kernel.org (v2) Link: https://lore.kernel.org/r/20211203111707.3901969-4-brauner@kernel.org Cc: Seth Forshee Cc: Christoph Hellwig Cc: Al Viro CC: linux-fsdevel@vger.kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Seth Forshee Signed-off-by: Christian Brauner --- include/linux/fs.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b3bcb2129699..db5ee15e36b1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1695,10 +1695,18 @@ static inline void inode_fsgid_set(struct inode *inode, static inline bool fsuidgid_has_mapping(struct super_block *sb, struct user_namespace *mnt_userns) { - struct user_namespace *s_user_ns = sb->s_user_ns; + struct user_namespace *fs_userns = sb->s_user_ns; + kuid_t kuid; + kgid_t kgid; - return kuid_has_mapping(s_user_ns, mapped_fsuid(mnt_userns)) && - kgid_has_mapping(s_user_ns, mapped_fsgid(mnt_userns)); + kuid = 
mapped_fsuid(mnt_userns); + if (!uid_valid(kuid)) + return false; + kgid = mapped_fsgid(mnt_userns); + if (!gid_valid(kgid)) + return false; + return kuid_has_mapping(fs_userns, kuid) && + kgid_has_mapping(fs_userns, kgid); } extern struct timespec64 current_time(struct inode *inode); -- cgit v1.2.3 From 1ac2a4104968e0a60b4b3572216a92aab5c1b025 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 3 Dec 2021 12:17:01 +0100 Subject: fs: account for filesystem mappings Currently we only support idmapped mounts for filesystems mounted without an idmapping. This was a conscious decision mentioned in multiple places (cf. e.g. [1]). As explained at length in [3] it is perfectly fine to extend support for idmapped mounts to filesystem's mounted with an idmapping should the need arise. The need has been there for some time now. Various container projects in userspace need this to run unprivileged and nested unprivileged containers (cf. [2]). Before we can port any filesystem that is mountable with an idmapping to support idmapped mounts we need to first extend the mapping helpers to account for the filesystem's idmapping. This again, is explained at length in our documentation at [3] but I'll give an overview here again. Currently, the low-level mapping helpers implement the remapping algorithms described in [3] in a simplified manner. Because we could rely on the fact that all filesystems supporting idmapped mounts are mounted without an idmapping the translation step from or into the filesystem idmapping could be skipped. In order to support idmapped mounts of filesystem's mountable with an idmapping the translation step we were able to skip before cannot be skipped anymore. A filesystem mounted with an idmapping is very likely to not use an identity mapping and will instead use a non-identity mapping. So the translation step from or into the filesystem's idmapping in the remapping algorithm cannot be skipped for such filesystems. 
More details with examples can be found in [3]. This patch adds a few new and prepares some already existing low-level mapping helpers to perform the full translation algorithm explained in [3]. The low-level helpers can be written in a way that they only perform the additional translation step when the filesystem is indeed mounted with an idmapping. If the low-level helpers detect that they are not dealing with an idmapped mount they can simply return the relevant k{g,u}id unchanged; no remapping needs to be performed at all. The no_idmapping() helper detects whether the shortcut can be used. If the low-level helpers detect that they are dealing with an idmapped mount but the underlying filesystem is mounted without an idmapping we can rely on the previous shortcut and can continue to skip the translation step from or into the filesystem's idmapping. These checks guarantee that only the minimal amount of work is performed. As before, if idmapped mounts aren't used the low-level helpers are idempotent and no work is performed at all. This patch adds the helpers mapped_k{g,u}id_fs() and mapped_k{g,u}id_user(). Following patches will port all places to replace the old k{g,u}id_into_mnt() and k{g,u}id_from_mnt() with these two new helpers. After the conversion is done k{g,u}id_into_mnt() and k{g,u}id_from_mnt() will be removed. This also concludes the renaming of the mapping helpers we started in [4]. Now, all mapping helpers will start with the "mapped_" prefix making everything nice and consistent. The mapped_k{g,u}id_fs() helpers replace the k{g,u}id_into_mnt() helpers. They are to be used when k{g,u}ids are to be mapped from the vfs, e.g. from struct inode's i_{g,u}id. Conversely, the mapped_k{g,u}id_user() helpers replace the k{g,u}id_from_mnt() helpers. They are to be used when k{g,u}ids are to be written to disk, e.g. when entering from a system call to change ownership of a file. This patch only introduces the helpers.
It doesn't yet convert the relevant places to account for filesystem mounted with an idmapping. [1]: commit 2ca4dcc4909d ("fs/mount_setattr: tighten permission checks") [2]: https://github.com/containers/podman/issues/10374 [3]: Documentations/filesystems/idmappings.rst [4]: commit a65e58e791a1 ("fs: document and rename fsid helpers") Link: https://lore.kernel.org/r/20211123114227.3124056-5-brauner@kernel.org (v1) Link: https://lore.kernel.org/r/20211130121032.3753852-5-brauner@kernel.org (v2) Link: https://lore.kernel.org/r/20211203111707.3901969-5-brauner@kernel.org Cc: Seth Forshee Cc: Amir Goldstein Cc: Christoph Hellwig Cc: Al Viro CC: linux-fsdevel@vger.kernel.org Reviewed-by: Seth Forshee Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 +- include/linux/mnt_idmapping.h | 193 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 191 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index db5ee15e36b1..57aee6ebba72 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1636,7 +1636,7 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns, const struct inode *inode) { - return kuid_into_mnt(mnt_userns, inode->i_uid); + return mapped_kuid_fs(mnt_userns, &init_user_ns, inode->i_uid); } /** @@ -1650,7 +1650,7 @@ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns, static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns, const struct inode *inode) { - return kgid_into_mnt(mnt_userns, inode->i_gid); + return mapped_kgid_fs(mnt_userns, &init_user_ns, inode->i_gid); } /** diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index 47c7811fadfe..60341cd33ccc 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -6,6 +6,11 @@ #include struct user_namespace; +/* + * Carries the initial idmapping of 0:0:4294967295 which is an 
identity + * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is + * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. + */ extern struct user_namespace init_user_ns; /** @@ -64,9 +69,189 @@ static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns, return KGIDT_INIT(from_kgid(mnt_userns, kgid)); } +/** + * initial_idmapping - check whether this is the initial mapping + * @ns: idmapping to check + * + * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1, + * [...], 1000 to 1000 [...]. + * + * Return: true if this is the initial mapping, false if not. + */ +static inline bool initial_idmapping(const struct user_namespace *ns) +{ + return ns == &init_user_ns; +} + +/** + * no_idmapping - check whether we can skip remapping a kuid/gid + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * + * This function can be used to check whether a remapping between two + * idmappings is required. + * An idmapped mount is a mount that has an idmapping attached to it that + * is different from the filsystem's idmapping and the initial idmapping. + * If the initial mapping is used or the idmapping of the mount and the + * filesystem are identical no remapping is required. + * + * Return: true if remapping can be skipped, false if not. + */ +static inline bool no_idmapping(const struct user_namespace *mnt_userns, + const struct user_namespace *fs_userns) +{ + return initial_idmapping(mnt_userns) || mnt_userns == fs_userns; +} + +/** + * mapped_kuid_fs - map a filesystem kuid into a mnt_userns + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @kuid : kuid to be mapped + * + * Take a @kuid and remap it from @fs_userns into @mnt_userns. Use this + * function when preparing a @kuid to be reported to userspace. + * + * If no_idmapping() determines that this is not an idmapped mount we can + * simply return @kuid unchanged. 
+ * If initial_idmapping() tells us that the filesystem is not mounted with an + * idmapping we know the value of @kuid won't change when calling + * from_kuid() so we can simply retrieve the value via __kuid_val() + * directly. + * + * Return: @kuid mapped according to @mnt_userns. + * If @kuid has no mapping in either @mnt_userns or @fs_userns INVALID_UID is + * returned. + */ +static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + kuid_t kuid) +{ + uid_t uid; + + if (no_idmapping(mnt_userns, fs_userns)) + return kuid; + if (initial_idmapping(fs_userns)) + uid = __kuid_val(kuid); + else + uid = from_kuid(fs_userns, kuid); + if (uid == (uid_t)-1) + return INVALID_UID; + return make_kuid(mnt_userns, uid); +} + +/** + * mapped_kgid_fs - map a filesystem kgid into a mnt_userns + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @kgid : kgid to be mapped + * + * Take a @kgid and remap it from @fs_userns into @mnt_userns. Use this + * function when preparing a @kgid to be reported to userspace. + * + * If no_idmapping() determines that this is not an idmapped mount we can + * simply return @kgid unchanged. + * If initial_idmapping() tells us that the filesystem is not mounted with an + * idmapping we know the value of @kgid won't change when calling + * from_kgid() so we can simply retrieve the value via __kgid_val() + * directly. + * + * Return: @kgid mapped according to @mnt_userns. + * If @kgid has no mapping in either @mnt_userns or @fs_userns INVALID_GID is + * returned. 
+ */ +static inline kgid_t mapped_kgid_fs(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + kgid_t kgid) +{ + gid_t gid; + + if (no_idmapping(mnt_userns, fs_userns)) + return kgid; + if (initial_idmapping(fs_userns)) + gid = __kgid_val(kgid); + else + gid = from_kgid(fs_userns, kgid); + if (gid == (gid_t)-1) + return INVALID_GID; + return make_kgid(mnt_userns, gid); +} + +/** + * mapped_kuid_user - map a user kuid into a mnt_userns + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @kuid : kuid to be mapped + * + * Use the idmapping of @mnt_userns to remap a @kuid into @fs_userns. Use this + * function when preparing a @kuid to be written to disk or inode. + * + * If no_idmapping() determines that this is not an idmapped mount we can + * simply return @kuid unchanged. + * If initial_idmapping() tells us that the filesystem is not mounted with an + * idmapping we know the value of @kuid won't change when calling + * make_kuid() so we can simply retrieve the value via KUIDT_INIT() + * directly. + * + * Return: @kuid mapped according to @mnt_userns. + * If @kuid has no mapping in either @mnt_userns or @fs_userns INVALID_UID is + * returned. + */ +static inline kuid_t mapped_kuid_user(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + kuid_t kuid) +{ + uid_t uid; + + if (no_idmapping(mnt_userns, fs_userns)) + return kuid; + uid = from_kuid(mnt_userns, kuid); + if (uid == (uid_t)-1) + return INVALID_UID; + if (initial_idmapping(fs_userns)) + return KUIDT_INIT(uid); + return make_kuid(fs_userns, uid); +} + +/** + * mapped_kgid_user - map a user kgid into a mnt_userns + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @kgid : kgid to be mapped + * + * Use the idmapping of @mnt_userns to remap a @kgid into @fs_userns. Use this + * function when preparing a @kgid to be written to disk or inode. 
+ * + * If no_idmapping() determines that this is not an idmapped mount we can + * simply return @kgid unchanged. + * If initial_idmapping() tells us that the filesystem is not mounted with an + * idmapping we know the value of @kgid won't change when calling + * make_kgid() so we can simply retrieve the value via KGIDT_INIT() + * directly. + * + * Return: @kgid mapped according to @mnt_userns. + * If @kgid has no mapping in either @mnt_userns or @fs_userns INVALID_GID is + * returned. + */ +static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + kgid_t kgid) +{ + gid_t gid; + + if (no_idmapping(mnt_userns, fs_userns)) + return kgid; + gid = from_kgid(mnt_userns, kgid); + if (gid == (gid_t)-1) + return INVALID_GID; + if (initial_idmapping(fs_userns)) + return KGIDT_INIT(gid); + return make_kgid(fs_userns, gid); +} + /** * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount + * @mnt_userns: the mount's idmapping * * Use this helper to initialize a new vfs or filesystem object based on * the caller's fsuid. A common example is initializing the i_uid field of @@ -78,12 +263,12 @@ static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns, */ static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns) { - return kuid_from_mnt(mnt_userns, current_fsuid()); + return mapped_kuid_user(mnt_userns, &init_user_ns, current_fsuid()); } /** * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount + * @mnt_userns: the mount's idmapping * * Use this helper to initialize a new vfs or filesystem object based on * the caller's fsgid. 
A common example is initializing the i_gid field of @@ -95,7 +280,7 @@ static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns) */ static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns) { - return kgid_from_mnt(mnt_userns, current_fsgid()); + return mapped_kgid_user(mnt_userns, &init_user_ns, current_fsgid()); } #endif /* _LINUX_MNT_IDMAPPING_H */ -- cgit v1.2.3 From 8581fd402a0cf80b5298e3b225e7a7bd8f110e69 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 2 Dec 2021 12:34:00 -0800 Subject: treewide: Add missing includes masked by cgroup -> bpf dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cgroup.h (therefore swap.h, therefore half of the universe) includes bpf.h which in turn includes module.h and slab.h. Since we're about to get rid of that dependency we need to clean things up. v2: drop the cpu.h include from cacheinfo.h, it's not necessary and it makes riscv sensitive to ordering of include files. 
Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Reviewed-by: Christoph Hellwig Acked-by: Krzysztof Wilczyński Acked-by: Peter Chen Acked-by: SeongJae Park Acked-by: Jani Nikula Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/all/20211120035253.72074-1-kuba@kernel.org/ # v1 Link: https://lore.kernel.org/all/20211120165528.197359-1-kuba@kernel.org/ # cacheinfo discussion Link: https://lore.kernel.org/bpf/20211202203400.1208663-1-kuba@kernel.org --- include/linux/cacheinfo.h | 1 - include/linux/device/driver.h | 1 + include/linux/filter.h | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 2f909ed084c6..4ff37cb763ae 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -3,7 +3,6 @@ #define _LINUX_CACHEINFO_H #include -#include #include #include diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index a498ebcf4993..15e7c5e15d62 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -18,6 +18,7 @@ #include #include #include +#include /** * enum probe_type - device driver probe type to try diff --git a/include/linux/filter.h b/include/linux/filter.h index 534f678ca50f..7f1e88e3e2b5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -6,6 +6,7 @@ #define __LINUX_FILTER_H__ #include +#include #include #include #include @@ -26,7 +27,6 @@ #include #include -#include struct sk_buff; struct sock; -- cgit v1.2.3 From 4bdcd1dd4d2f973b1a89fb20ba720d879e9e506b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Oct 2021 08:47:05 -0600 Subject: mm: move filemap_range_needs_writeback() into header No functional changes in this patch, just in preparation for efficiently calling this light function from the block O_DIRECT handling. 
Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 -- include/linux/pagemap.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbf812ce89a8..6b8dc1a78df6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2847,8 +2847,6 @@ static inline int filemap_fdatawait(struct address_space *mapping) extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); -extern bool filemap_range_needs_writeback(struct address_space *, - loff_t lstart, loff_t lend); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); extern int __filemap_fdatawrite_range(struct address_space *mapping, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 605246452305..274a0710f2c5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -963,6 +963,35 @@ static inline int add_to_page_cache(struct page *page, int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp); +bool filemap_range_has_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte); + +/** + * filemap_range_needs_writeback - check if range potentially needs writeback + * @mapping: address space within which to check + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. Used by O_DIRECT + * read/write with IOCB_NOWAIT, to see if the caller needs to do + * filemap_write_and_wait_range() before proceeding. + * + * Return: %true if the caller should do filemap_write_and_wait_range() before + * doing O_DIRECT to a page in this range, %false otherwise. 
+ */ +static inline bool filemap_range_needs_writeback(struct address_space *mapping, + loff_t start_byte, + loff_t end_byte) +{ + if (!mapping->nrpages) + return false; + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + return false; + return filemap_range_has_writeback(mapping, start_byte, end_byte); +} + /** * struct readahead_control - Describes a readahead request. * -- cgit v1.2.3 From 0a467d0fdd9594fbb449ebc93852533332c528fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Oct 2021 14:39:59 -0600 Subject: block: switch to atomic_t for request references refcount_t is not as expensive as it used to be, but it's still more expensive than the io_uring method of using atomic_t and just checking for potential over/underflow. This borrows that same implementation, which in turn is based on the mm implementation from Linus. Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1b87b7c8bbff..561beb5be7ec 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -139,7 +139,7 @@ struct request { unsigned short ioprio; enum mq_rq_state state; - refcount_t ref; + atomic_t ref; unsigned long deadline; -- cgit v1.2.3 From 704b914f15fb7daaf517e3acc4bed472b50ca19e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 3 Dec 2021 21:15:32 +0800 Subject: blk-mq: move srcu from blk_mq_hw_ctx to request_queue In case of BLK_MQ_F_BLOCKING, per-hctx srcu is used to protect dispatch critical area. However, this srcu instance stays at the end of hctx, and it often takes standalone cacheline, often cold. Inside srcu_read_lock() and srcu_read_unlock(), WRITE is always done on the indirect percpu variable which is allocated from heap instead of being embedded, srcu->srcu_idx is read only in srcu_read_lock(). 
It doesn't matter if srcu structure stays in hctx or request queue. So switch to per-request-queue srcu for protecting dispatch, and this simplifies quiesce a lot, not to mention that quiesce is always done request-queue wide. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211203131534.3668411-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 -------- include/linux/blkdev.h | 9 +++++++++ 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 561beb5be7ec..ecdc049b52fa 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -375,13 +374,6 @@ struct blk_mq_hw_ctx { * q->unused_hctx_list. */ struct list_head hctx_list; - - /** - * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is - * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also - * blk_mq_hw_ctx_size(). - */ - struct srcu_struct srcu[]; }; /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0a4416ef4fbf..c80cfaefc0a8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -16,6 +16,7 @@ #include #include #include +#include struct module; struct request_queue; @@ -373,11 +374,18 @@ struct request_queue { * devices that do not have multiple independent access ranges. */ struct blk_independent_access_ranges *ia_ranges; + + /** + * @srcu: Sleepable RCU. Use as lock when type of the request queue + * is blocking (BLK_MQ_F_BLOCKING).
Must be the last member + */ + struct srcu_struct srcu[]; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ +#define QUEUE_FLAG_HAS_SRCU 2 /* SRCU is allocated */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ @@ -415,6 +423,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) +#define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags) #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) -- cgit v1.2.3 From 0cc3a8017900f856f9bf4fdc41c2b5cb1670aabe Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Thu, 2 Dec 2021 13:01:56 -0800 Subject: qed*: enhance tx timeout debug info This patch add some new qed APIs to query status block info and report various data to MFW on tx timeout event Along with that it enhances qede to dump more debug logs (not just specific to the queue which was reported by stack) on tx timeout which includes various other basic metadata about all tx queues and other info (like status block etc.) 
Signed-off-by: Manish Chopra Signed-off-by: Prabhakar Kushwaha Signed-off-by: Alok Prasad Signed-off-by: Ariel Elior Signed-off-by: Jakub Kicinski --- include/linux/qed/qed_if.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 0dae7fcc5ef2..9f4bfa2a4829 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -807,6 +807,12 @@ struct qed_devlink { struct devlink_health_reporter *fw_reporter; }; +struct qed_sb_info_dbg { + u32 igu_prod; + u32 igu_cons; + u16 pi[PIS_PER_SB]; +}; + struct qed_common_cb_ops { void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc); void (*link_update)(void *dev, struct qed_link_output *link); @@ -1194,6 +1200,11 @@ struct qed_common_ops { struct devlink* (*devlink_register)(struct qed_dev *cdev); void (*devlink_unregister)(struct devlink *devlink); + + __printf(2, 3) void (*mfw_report)(struct qed_dev *cdev, char *fmt, ...); + + int (*get_sb_info)(struct qed_dev *cdev, struct qed_sb_info *sb, + u16 qid, struct qed_sb_info_dbg *sb_dbg); }; #define MASK_FIELD(_name, _value) \ -- cgit v1.2.3 From 823163ba6e52e644be5df4539a19e3df8d0988dd Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Thu, 2 Dec 2021 13:01:57 -0800 Subject: qed*: esl priv flag support through ethtool ESL(Enhanced System Lockdown) was designed to lock PCI adapter firmware images and prevent changes to critical non-volatile configuration data so that uncontrolled, malicious or unintentional modification to the adapters are avoided, ensuring it's operational state. Once this feature is enabled, the device is locked, rejecting any modification to non-volatile images. Once unlocked, the protection is off such that firmware and non-volatile configurations may be altered. Driver just reflects the capability and status of this through the ethtool private flag. 
Signed-off-by: Manish Chopra Signed-off-by: Prabhakar Kushwaha Signed-off-by: Alok Prasad Signed-off-by: Ariel Elior Signed-off-by: Jakub Kicinski --- include/linux/qed/qed_if.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 9f4bfa2a4829..6dc4943d8aec 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -652,6 +652,7 @@ struct qed_dev_info { bool wol_support; bool smart_an; + bool esl; /* MBI version */ u32 mbi_version; @@ -1205,6 +1206,8 @@ struct qed_common_ops { int (*get_sb_info)(struct qed_dev *cdev, struct qed_sb_info *sb, u16 qid, struct qed_sb_info_dbg *sb_dbg); + + int (*get_esl_status)(struct qed_dev *cdev, bool *esl_active); }; #define MASK_FIELD(_name, _value) \ -- cgit v1.2.3 From a3642021923b26d86bb27d88c826494827612c06 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 29 Nov 2021 18:46:47 +0100 Subject: locking/rtmutex: Add rt_mutex_lock_nest_lock() and rt_mutex_lock_killable(). The locking selftest for ww-mutex expects to operate directly on the base-mutex which becomes a rtmutex on PREEMPT_RT. Add a rtmutex based implementation of mutex_lock_nest_lock() and mutex_lock_killable() named rt_mutex_lock_nest_lock() and rt_mutex_lock_killable(). 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211129174654.668506-5-bigeasy@linutronix.de --- include/linux/rtmutex.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 9deedfeec2b1..7d049883a08a 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -99,13 +99,22 @@ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); +extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock); #define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0) +#define rt_mutex_lock_nest_lock(lock, nest_lock) \ + do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _rt_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ + } while (0) + #else extern void rt_mutex_lock(struct rt_mutex *lock); #define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock) +#define rt_mutex_lock_nest_lock(lock, nest_lock) rt_mutex_lock(lock) #endif extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock); extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); -- cgit v1.2.3 From 0c1d7a2c2d32fac7ff4a644724b2d52a64184645 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Nov 2021 18:46:48 +0100 Subject: lockdep: Remove softirq accounting on PREEMPT_RT. There is not really a softirq context on PREEMPT_RT. Softirqs on PREEMPT_RT are always invoked within the context of a threaded interrupt handler or within ksoftirqd. The "in-softirq" context is preemptible and is protected by a per-CPU lock to ensure mutual exclusion. 
There is no difference on PREEMPT_RT between spin_lock_irq() and spin_lock() because the former does not disable interrupts. Therefore if a lock is used in_softirq() and locked once with spin_lock_irq() then lockdep will report this with "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage". Teach lockdep that we don't really do softirqs on -RT. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211129174654.668506-6-bigeasy@linutronix.de --- include/linux/irqflags.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 600c10da321a..4b140938b03e 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -71,14 +71,6 @@ do { \ do { \ __this_cpu_dec(hardirq_context); \ } while (0) -# define lockdep_softirq_enter() \ -do { \ - current->softirq_context++; \ -} while (0) -# define lockdep_softirq_exit() \ -do { \ - current->softirq_context--; \ -} while (0) # define lockdep_hrtimer_enter(__hrtimer) \ ({ \ @@ -140,6 +132,21 @@ do { \ # define lockdep_irq_work_exit(__work) do { } while (0) #endif +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) +# define lockdep_softirq_enter() \ +do { \ + current->softirq_context++; \ +} while (0) +# define lockdep_softirq_exit() \ +do { \ + current->softirq_context--; \ +} while (0) + +#else +# define lockdep_softirq_enter() do { } while (0) +# define lockdep_softirq_exit() do { } while (0) +#endif + #if defined(CONFIG_IRQSOFF_TRACER) || \ defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); -- cgit v1.2.3 From c0bed69daf4b67809b58cc7cd81a8fa4f45bc161 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 3 Dec 2021 15:59:34 +0800 Subject: locking: Make owner_on_cpu() into <linux/sched.h> Move the owner_on_cpu() from kernel/locking/rwsem.c into include/linux/sched.h under CONFIG_SMP, then use it in the mutex/rwsem/rtmutex to 
simplify the code. Signed-off-by: Kefeng Wang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211203075935.136808-2-wangkefeng.wang@huawei.com --- include/linux/sched.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 78c351e35fec..ff609d9c2f21 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2171,6 +2171,15 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #endif #ifdef CONFIG_SMP +static inline bool owner_on_cpu(struct task_struct *owner) +{ + /* + * As lock holder preemption issue, we both skip spinning if + * task is not on cpu or its cpu is preempted + */ + return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); +} + /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu, unsigned long max); #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 4cf75fd4a2545ca4deea992f929602c9fdbe8058 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 3 Dec 2021 15:59:35 +0800 Subject: locking: Mark racy reads of owner->on_cpu One of the more frequent data races reported by KCSAN is the racy read in mutex_spin_on_owner(), which is usually reported as "race of unknown origin" without showing the writer. This is due to the racing write occurring in kernel/sched. 
Locally enabling KCSAN in kernel/sched shows: | write (marked) to 0xffff97f205079934 of 4 bytes by task 316 on cpu 6: | finish_task kernel/sched/core.c:4632 [inline] | finish_task_switch kernel/sched/core.c:4848 | context_switch kernel/sched/core.c:4975 [inline] | __schedule kernel/sched/core.c:6253 | schedule kernel/sched/core.c:6326 | schedule_preempt_disabled kernel/sched/core.c:6385 | __mutex_lock_common kernel/locking/mutex.c:680 | __mutex_lock kernel/locking/mutex.c:740 [inline] | __mutex_lock_slowpath kernel/locking/mutex.c:1028 | mutex_lock kernel/locking/mutex.c:283 | tty_open_by_driver drivers/tty/tty_io.c:2062 [inline] | ... | | read to 0xffff97f205079934 of 4 bytes by task 322 on cpu 3: | mutex_spin_on_owner kernel/locking/mutex.c:370 | mutex_optimistic_spin kernel/locking/mutex.c:480 | __mutex_lock_common kernel/locking/mutex.c:610 | __mutex_lock kernel/locking/mutex.c:740 [inline] | __mutex_lock_slowpath kernel/locking/mutex.c:1028 | mutex_lock kernel/locking/mutex.c:283 | tty_open_by_driver drivers/tty/tty_io.c:2062 [inline] | ... | | value changed: 0x00000001 -> 0x00000000 This race is clearly intentional, and the potential for miscompilation is slim due to surrounding barrier() and cpu_relax(), and the value being used as a boolean. Nevertheless, marking this reader would more clearly denote intent and make it obvious that concurrency is expected. Use READ_ONCE() to avoid having to reason about compiler optimizations now and in future. With previous refactor, mark the read to owner->on_cpu in owner_on_cpu(), which immediately precedes the loop executing mutex_spin_on_owner(). 
Signed-off-by: Marco Elver Signed-off-by: Kefeng Wang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211203075935.136808-3-wangkefeng.wang@huawei.com --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ff609d9c2f21..0b9b0e3f4791 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2177,7 +2177,7 @@ static inline bool owner_on_cpu(struct task_struct *owner) * As lock holder preemption issue, we both skip spinning if * task is not on cpu or its cpu is preempted */ - return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); + return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } /* Returns effective CPU energy utilization, as seen by the scheduler */ -- cgit v1.2.3 From fb08a1908cb119a4585611d91461ab6d27756b14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Nov 2021 11:21:38 +0100 Subject: dax: simplify the dax_device <-> gendisk association Replace the dax_host_hash with an xarray indexed by the pointer value of the gendisk, and require explicit calls from the block drivers that want to associate their gendisk with a dax_device. Signed-off-by: Christoph Hellwig Acked-by: Mike Snitzer Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20211129102203.2243509-5-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 8623caa67388..e2e9a67004cb 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -11,9 +11,11 @@ typedef unsigned long dax_entry_t; +struct dax_device; +struct gendisk; struct iomap_ops; struct iomap; -struct dax_device; + struct dax_operations { /* * direct_access: translate a device-relative @@ -39,8 +41,8 @@ struct dax_operations { }; #if IS_ENABLED(CONFIG_DAX) -struct dax_device *alloc_dax(void *private, const char *host, - const struct dax_operations *ops, unsigned long flags); +struct dax_device *alloc_dax(void *private, const struct dax_operations *ops, + unsigned long flags); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); @@ -68,7 +70,7 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, return dax_synchronous(dax_dev); } #else -static inline struct dax_device *alloc_dax(void *private, const char *host, +static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops, unsigned long flags) { /* @@ -107,6 +109,8 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct writeback_control; int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) +int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); +void dax_remove_host(struct gendisk *disk); bool generic_fsdax_supported(struct dax_device *dax_dev, struct block_device *bdev, int blocksize, sector_t start, sector_t sectors); @@ -128,6 +132,13 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t st dax_entry_t dax_lock_page(struct page *page); void 
dax_unlock_page(struct page *page, dax_entry_t cookie); #else +static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) +{ + return 0; +} +static inline void dax_remove_host(struct gendisk *disk) +{ +} #define generic_fsdax_supported NULL static inline bool dax_supported(struct dax_device *dax_dev, -- cgit v1.2.3 From 7b0800d00dae8c897398abaf61e82db0d67d7afc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Nov 2021 11:21:42 +0100 Subject: dax: remove dax_capable Just open code the block size and dax_dev == NULL checks in the callers. Signed-off-by: Christoph Hellwig Acked-by: Mike Snitzer Reviewed-by: Gao Xiang [erofs] Reviewed-by: Dan Williams Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20211129102203.2243509-9-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index e2e9a67004cb..439c3c70e347 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -111,12 +111,6 @@ int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); -bool generic_fsdax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t sectors); - -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, - int blocksize, sector_t start, sector_t len); static inline void fs_put_dax(struct dax_device *dax_dev) { @@ -139,14 +133,6 @@ static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) static inline void dax_remove_host(struct gendisk *disk) { } -#define generic_fsdax_supported NULL - -static inline bool dax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t len) -{ - return false; -} 
static inline void fs_put_dax(struct dax_device *dax_dev) { -- cgit v1.2.3 From 60696eb26a37ab0199f7833ddbc1b75138c36d16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Nov 2021 11:21:48 +0100 Subject: fsdax: simplify the pgoff calculation Replace the two steps of dax_iomap_sector and bdev_dax_pgoff with a single dax_iomap_pgoff helper that avoids lots of cumbersome sector conversions. Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20211129102203.2243509-15-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 439c3c70e347..324363b798ec 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -107,7 +107,6 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, #endif struct writeback_control; -int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); -- cgit v1.2.3 From c6f40468657d16e4010ef84bf32a761feb3469ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Nov 2021 11:21:52 +0100 Subject: fsdax: decouple zeroing from the iomap buffered I/O code Unshare the DAX and iomap buffered I/O page zeroing code. This code previously did a IS_DAX check deep inside the iomap code, which in fact was the only DAX check in the code. Instead move these checks into the callers. Most callers already have DAX special casing anyway and XFS will need it for reflink support as well. Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20211129102203.2243509-19-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 324363b798ec..b79036743e7f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,6 +14,7 @@ typedef unsigned long dax_entry_t; struct dax_device; struct gendisk; struct iomap_ops; +struct iomap_iter; struct iomap; struct dax_operations { @@ -170,6 +171,11 @@ static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) } #endif +int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, + const struct iomap_ops *ops); +int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, + const struct iomap_ops *ops); + #if IS_ENABLED(CONFIG_DAX) int dax_read_lock(void); void dax_read_unlock(int id); @@ -204,7 +210,6 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); -- cgit v1.2.3 From 952da06375c8f3aa58474fff718d9ae8442531b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Nov 2021 11:21:58 +0100 Subject: iomap: add a IOMAP_DAX flag Add a flag so that the file system can easily detect DAX operations based just on the iomap operation requested instead of looking at inode state using IS_DAX. This will be needed to apply the to be added partition offset only for operations that actually use DAX, but not things like fiemap that are based on the block device. In the long run it should also allow turning the bdev, dax_dev and inline_data into a union. 
Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20211129102203.2243509-25-hch@lst.de Signed-off-by: Dan Williams --- include/linux/iomap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6d1b08d0ae93..5b9432f9f79e 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -141,6 +141,11 @@ struct iomap_page_ops { #define IOMAP_NOWAIT (1 << 5) /* do not block */ #define IOMAP_OVERWRITE_ONLY (1 << 6) /* only pure overwrites allowed */ #define IOMAP_UNSHARE (1 << 7) /* unshare_file_range */ +#ifdef CONFIG_FS_DAX +#define IOMAP_DAX (1 << 8) /* DAX mapping */ +#else +#define IOMAP_DAX 0 +#endif /* CONFIG_FS_DAX */ struct iomap_ops { /* -- cgit v1.2.3 From cd913c76f489def1a388e3a5b10df94948ede3f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Nov 2021 11:21:59 +0100 Subject: dax: return the partition offset from fs_dax_get_by_bdev Prepare for the removal of the block_device from the DAX I/O path by returning the partition offset from fs_dax_get_by_bdev so that the file systems have it at hand for use during I/O. Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20211129102203.2243509-26-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index b79036743e7f..f6f353382cc9 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -117,7 +117,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev) put_dax(dax_dev); } -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, + u64 *start_off); int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc); @@ -138,7 +139,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev) { } -static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) +static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, + u64 *start_off) { return NULL; } -- cgit v1.2.3 From 2ede892342b3c628991ff1b9060108a7edd92d94 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Nov 2021 11:22:01 +0100 Subject: dax: fix up some of the block device related ifdefs The DAX device <-> block device association is only enabled if CONFIG_BLOCK is enabled. Update dax.h to account for that and use the right conditions for the fs_put_dax stub as well. Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20211129102203.2243509-28-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index f6f353382cc9..87ae4c9b1d65 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -108,24 +108,15 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, #endif struct writeback_control; -#if IS_ENABLED(CONFIG_FS_DAX) +#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); - +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, + u64 *start_off); static inline void fs_put_dax(struct dax_device *dax_dev) { put_dax(dax_dev); } - -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off); -int dax_writeback_mapping_range(struct address_space *mapping, - struct dax_device *dax_dev, struct writeback_control *wbc); - -struct page *dax_layout_busy_page(struct address_space *mapping); -struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); -dax_entry_t dax_lock_page(struct page *page); -void dax_unlock_page(struct page *page, dax_entry_t cookie); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { @@ -134,17 +125,25 @@ static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) static inline void dax_remove_host(struct gendisk *disk) { } - -static inline void fs_put_dax(struct dax_device *dax_dev) -{ -} - static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off) { return NULL; } +static inline void fs_put_dax(struct dax_device *dax_dev) +{ +} +#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ +#if IS_ENABLED(CONFIG_FS_DAX) +int dax_writeback_mapping_range(struct address_space 
*mapping, + struct dax_device *dax_dev, struct writeback_control *wbc); + +struct page *dax_layout_busy_page(struct address_space *mapping); +struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); +dax_entry_t dax_lock_page(struct page *page); +void dax_unlock_page(struct page *page, dax_entry_t cookie); +#else static inline struct page *dax_layout_busy_page(struct address_space *mapping) { return NULL; -- cgit v1.2.3 From 866de407444398bc8140ea70de1dba5f91cc34ac Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 3 Dec 2021 13:30:01 +0800 Subject: bpf: Disallow BPF_LOG_KERNEL log level for bpf(BPF_BTF_LOAD) BPF_LOG_KERNEL is only used internally, so disallow bpf_btf_load() to set log level as BPF_LOG_KERNEL. The same checking has already been done in bpf_check(), so factor out a helper to check the validity of log attributes and use it in both places. Fixes: 8580ac9404f6 ("bpf: Process in-kernel BTF") Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211203053001.740945-1-houtao1@huawei.com --- include/linux/bpf_verifier.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c8a78e830fca..182b16a91084 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -396,6 +396,13 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) log->level == BPF_LOG_KERNEL); } +static inline bool +bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +{ + return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 && + log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); +} + #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_info { -- cgit v1.2.3 From b80892ca022e9eb484771a66eb68e12364695a2a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Oct 2021 17:10:17 +0200 Subject: 
memremap: remove support for external pgmap refcounts No driver is left using the external pgmap refcount, so remove the code to support it. Signed-off-by: Christoph Hellwig Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211028151017.50234-1-hch@lst.de Signed-off-by: Dan Williams --- include/linux/memremap.h | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index c0e9d35889e8..a8bc588fe7aa 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -72,16 +72,6 @@ struct dev_pagemap_ops { */ void (*page_free)(struct page *page); - /* - * Transition the refcount in struct dev_pagemap to the dead state. - */ - void (*kill)(struct dev_pagemap *pgmap); - - /* - * Wait for refcount in struct dev_pagemap to be idle and reap it. - */ - void (*cleanup)(struct dev_pagemap *pgmap); - /* * Used for private (un-addressable) device memory only. Must migrate * the page back to a CPU accessible page. 
@@ -95,8 +85,7 @@ struct dev_pagemap_ops { * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping - * @internal_ref: internal reference if @ref is not provided by the caller - * @done: completion for @internal_ref + * @done: completion for @ref * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior * @ops: method table @@ -109,8 +98,7 @@ struct dev_pagemap_ops { */ struct dev_pagemap { struct vmem_altmap altmap; - struct percpu_ref *ref; - struct percpu_ref internal_ref; + struct percpu_ref ref; struct completion done; enum memory_type type; unsigned int flags; @@ -191,7 +179,7 @@ static inline unsigned long memremap_compat_align(void) static inline void put_dev_pagemap(struct dev_pagemap *pgmap) { if (pgmap) - percpu_ref_put(pgmap->ref); + percpu_ref_put(&pgmap->ref); } #endif /* _LINUX_MEMREMAP_H_ */ -- cgit v1.2.3 From 02e4079913500f24ceb082d8d87d8665f044b298 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 3 Dec 2021 12:17:04 +0100 Subject: fs: remove unused low-level mapping helpers Now that we ported all places to use the new low-level mapping helpers that are able to support filesystems mounted with an idmapping we can remove the old low-level mapping helpers. With the removal of these old helpers we also conclude the renaming of the mapping helpers we started in commit a65e58e791a1 ("fs: document and rename fsid helpers"). 
Link: https://lore.kernel.org/r/20211123114227.3124056-8-brauner@kernel.org (v1) Link: https://lore.kernel.org/r/20211130121032.3753852-8-brauner@kernel.org (v2) Link: https://lore.kernel.org/r/20211203111707.3901969-8-brauner@kernel.org Cc: Seth Forshee Cc: Christoph Hellwig Cc: Al Viro CC: linux-fsdevel@vger.kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Seth Forshee Signed-off-by: Christian Brauner --- include/linux/mnt_idmapping.h | 56 ------------------------------------------- 1 file changed, 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index 60341cd33ccc..0c6ab3f4c952 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -13,62 +13,6 @@ struct user_namespace; */ extern struct user_namespace init_user_ns; -/** - * kuid_into_mnt - map a kuid down into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kuid: kuid to be mapped - * - * Return: @kuid mapped according to @mnt_userns. - * If @kuid has no mapping INVALID_UID is returned. - */ -static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns, - kuid_t kuid) -{ - return make_kuid(mnt_userns, __kuid_val(kuid)); -} - -/** - * kgid_into_mnt - map a kgid down into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kgid: kgid to be mapped - * - * Return: @kgid mapped according to @mnt_userns. - * If @kgid has no mapping INVALID_GID is returned. - */ -static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns, - kgid_t kgid) -{ - return make_kgid(mnt_userns, __kgid_val(kgid)); -} - -/** - * kuid_from_mnt - map a kuid up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kuid: kuid to be mapped - * - * Return: @kuid mapped up according to @mnt_userns. - * If @kuid has no mapping INVALID_UID is returned. 
- */ -static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns, - kuid_t kuid) -{ - return KUIDT_INIT(from_kuid(mnt_userns, kuid)); -} - -/** - * kgid_from_mnt - map a kgid up into a mnt_userns - * @mnt_userns: user namespace of the relevant mount - * @kgid: kgid to be mapped - * - * Return: @kgid mapped up according to @mnt_userns. - * If @kgid has no mapping INVALID_GID is returned. - */ -static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns, - kgid_t kgid) -{ - return KGIDT_INIT(from_kgid(mnt_userns, kgid)); -} - /** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check -- cgit v1.2.3 From 209188ce75d0d357c292f6bb81d712acdd4e7db7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 3 Dec 2021 12:17:05 +0100 Subject: fs: port higher-level mapping helpers Enable the mapped_fs{g,u}id() helpers to support filesystems mounted with an idmapping. Apart from core mapping helpers that use mapped_fs{g,u}id() to initialize struct inode's i_{g,u}id fields xfs is the only place that uses these low-level helpers directly. The patch only extends the helpers to be able to take the filesystem idmapping into account. Since we don't actually yet pass the filesystem's idmapping in no functional changes happen. This will happen in a final patch. 
Link: https://lore.kernel.org/r/20211123114227.3124056-9-brauner@kernel.org (v1) Link: https://lore.kernel.org/r/20211130121032.3753852-9-brauner@kernel.org (v2) Link: https://lore.kernel.org/r/20211203111707.3901969-9-brauner@kernel.org Cc: Seth Forshee Cc: Christoph Hellwig Cc: Al Viro CC: linux-fsdevel@vger.kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Seth Forshee Signed-off-by: Christian Brauner --- include/linux/fs.h | 8 ++++---- include/linux/mnt_idmapping.h | 12 ++++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 57aee6ebba72..e1f28f757f1b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1664,7 +1664,7 @@ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns, static inline void inode_fsuid_set(struct inode *inode, struct user_namespace *mnt_userns) { - inode->i_uid = mapped_fsuid(mnt_userns); + inode->i_uid = mapped_fsuid(mnt_userns, &init_user_ns); } /** @@ -1678,7 +1678,7 @@ static inline void inode_fsuid_set(struct inode *inode, static inline void inode_fsgid_set(struct inode *inode, struct user_namespace *mnt_userns) { - inode->i_gid = mapped_fsgid(mnt_userns); + inode->i_gid = mapped_fsgid(mnt_userns, &init_user_ns); } /** @@ -1699,10 +1699,10 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb, kuid_t kuid; kgid_t kgid; - kuid = mapped_fsuid(mnt_userns); + kuid = mapped_fsuid(mnt_userns, &init_user_ns); if (!uid_valid(kuid)) return false; - kgid = mapped_fsgid(mnt_userns); + kgid = mapped_fsgid(mnt_userns, &init_user_ns); if (!gid_valid(kgid)) return false; return kuid_has_mapping(fs_userns, kuid) && diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index 0c6ab3f4c952..ee5a217de2a8 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -196,6 +196,7 @@ static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns, /** * mapped_fsuid - return 
caller's fsuid mapped up into a mnt_userns * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping * * Use this helper to initialize a new vfs or filesystem object based on * the caller's fsuid. A common example is initializing the i_uid field of @@ -205,14 +206,16 @@ static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns, * * Return: the caller's current fsuid mapped up according to @mnt_userns. */ -static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns) +static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns) { - return mapped_kuid_user(mnt_userns, &init_user_ns, current_fsuid()); + return mapped_kuid_user(mnt_userns, fs_userns, current_fsuid()); } /** * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping * * Use this helper to initialize a new vfs or filesystem object based on * the caller's fsgid. A common example is initializing the i_gid field of @@ -222,9 +225,10 @@ static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns) * * Return: the caller's current fsgid mapped up according to @mnt_userns. */ -static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns) +static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns) { - return mapped_kgid_user(mnt_userns, &init_user_ns, current_fsgid()); + return mapped_kgid_user(mnt_userns, fs_userns, current_fsgid()); } #endif /* _LINUX_MNT_IDMAPPING_H */ -- cgit v1.2.3 From a1ec9040a2a9122605ac26e5725c6de019184419 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 3 Dec 2021 12:17:06 +0100 Subject: fs: add i_user_ns() helper Since we'll be passing the filesystem's idmapping in even more places in the following patches and we do already dereference struct inode to get to the filesystem's idmapping multiple times add a tiny helper. 
Link: https://lore.kernel.org/r/20211123114227.3124056-10-brauner@kernel.org (v1) Link: https://lore.kernel.org/r/20211130121032.3753852-10-brauner@kernel.org (v2) Link: https://lore.kernel.org/r/20211203111707.3901969-10-brauner@kernel.org Cc: Seth Forshee Cc: Christoph Hellwig Cc: Al Viro CC: linux-fsdevel@vger.kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Seth Forshee Signed-off-by: Christian Brauner --- include/linux/fs.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e1f28f757f1b..3d6d514943ab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1600,6 +1600,11 @@ struct super_block { struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; +static inline struct user_namespace *i_user_ns(const struct inode *inode) +{ + return inode->i_sb->s_user_ns; +} + /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored @@ -1607,22 +1612,22 @@ struct super_block { */ static inline uid_t i_uid_read(const struct inode *inode) { - return from_kuid(inode->i_sb->s_user_ns, inode->i_uid); + return from_kuid(i_user_ns(inode), inode->i_uid); } static inline gid_t i_gid_read(const struct inode *inode) { - return from_kgid(inode->i_sb->s_user_ns, inode->i_gid); + return from_kgid(i_user_ns(inode), inode->i_gid); } static inline void i_uid_write(struct inode *inode, uid_t uid) { - inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid); + inode->i_uid = make_kuid(i_user_ns(inode), uid); } static inline void i_gid_write(struct inode *inode, gid_t gid) { - inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); + inode->i_gid = make_kgid(i_user_ns(inode), gid); } /** -- cgit v1.2.3 From bd303368b776eead1c29e6cdda82bde7128b82a7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 3 Dec 2021 12:17:07 +0100 Subject: fs: 
support mapped mounts of mapped filesystems In previous patches we added new and modified existing helpers to handle idmapped mounts of filesystems mounted with an idmapping. In this final patch we convert all relevant places in the vfs to actually pass the filesystem's idmapping into these helpers. With this the vfs is in shape to handle idmapped mounts of filesystems mounted with an idmapping. Note that this is just the generic infrastructure. Actually adding support for idmapped mounts to a filesystem mountable with an idmapping is follow-up work. In this patch we extend the definition of an idmapped mount from a mount that has a non-initial idmapping attached to it to a mount that has an idmapping attached to it which is not the same as the idmapping the filesystem was mounted with. As before we do not allow the initial idmapping to be attached to a mount. In addition this patch prevents the idmapping the filesystem was mounted with from being attached to a mount created based on this filesystem. This has multiple reasons and advantages. First, attaching the initial idmapping or the filesystem's idmapping doesn't make much sense as in both cases the values of the i_{g,u}id and other places where k{g,u}ids are used do not change. Second, a user that really wants to do this for whatever reason can just create a separate dedicated identical idmapping to attach to the mount. Third, we can continue to use the initial idmapping as an indicator that a mount is not idmapped allowing us to continue to keep passing the initial idmapping into the mapping helpers to tell them that something isn't an idmapped mount even if the filesystem is mounted with an idmapping. 
Link: https://lore.kernel.org/r/20211123114227.3124056-11-brauner@kernel.org (v1) Link: https://lore.kernel.org/r/20211130121032.3753852-11-brauner@kernel.org (v2) Link: https://lore.kernel.org/r/20211203111707.3901969-11-brauner@kernel.org Cc: Seth Forshee Cc: Amir Goldstein Cc: Christoph Hellwig Cc: Al Viro CC: linux-fsdevel@vger.kernel.org Reviewed-by: Seth Forshee Signed-off-by: Christian Brauner --- include/linux/fs.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3d6d514943ab..493b87e3616b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1641,7 +1641,7 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns, const struct inode *inode) { - return mapped_kuid_fs(mnt_userns, &init_user_ns, inode->i_uid); + return mapped_kuid_fs(mnt_userns, i_user_ns(inode), inode->i_uid); } /** @@ -1655,7 +1655,7 @@ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns, static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns, const struct inode *inode) { - return mapped_kgid_fs(mnt_userns, &init_user_ns, inode->i_gid); + return mapped_kgid_fs(mnt_userns, i_user_ns(inode), inode->i_gid); } /** @@ -1669,7 +1669,7 @@ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns, static inline void inode_fsuid_set(struct inode *inode, struct user_namespace *mnt_userns) { - inode->i_uid = mapped_fsuid(mnt_userns, &init_user_ns); + inode->i_uid = mapped_fsuid(mnt_userns, i_user_ns(inode)); } /** @@ -1683,7 +1683,7 @@ static inline void inode_fsuid_set(struct inode *inode, static inline void inode_fsgid_set(struct inode *inode, struct user_namespace *mnt_userns) { - inode->i_gid = mapped_fsgid(mnt_userns, &init_user_ns); + inode->i_gid = mapped_fsgid(mnt_userns, i_user_ns(inode)); } /** @@ -1704,10 +1704,10 @@ static inline bool 
fsuidgid_has_mapping(struct super_block *sb, kuid_t kuid; kgid_t kgid; - kuid = mapped_fsuid(mnt_userns, &init_user_ns); + kuid = mapped_fsuid(mnt_userns, fs_userns); if (!uid_valid(kuid)) return false; - kgid = mapped_fsgid(mnt_userns, &init_user_ns); + kgid = mapped_fsgid(mnt_userns, fs_userns); if (!gid_valid(kgid)) return false; return kuid_has_mapping(fs_userns, kuid) && @@ -2653,13 +2653,14 @@ static inline struct user_namespace *file_mnt_user_ns(struct file *file) * is_idmapped_mnt - check whether a mount is mapped * @mnt: the mount to check * - * If @mnt has an idmapping attached to it @mnt is mapped. + * If @mnt has an idmapping attached different from the + * filesystem's idmapping then @mnt is mapped. * * Return: true if mount is mapped, false if not. */ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) { - return mnt_user_ns(mnt) != &init_user_ns; + return mnt_user_ns(mnt) != mnt->mnt_sb->s_user_ns; } extern long vfs_truncate(const struct path *, loff_t); -- cgit v1.2.3 From 019cd8a9e3bcbbf6bac8036a9ae545f7858e0c08 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 2 Nov 2021 16:02:01 -0600 Subject: ARM: ixp4xx: remove unused header file pata_ixp4xx_cf.h Commit b00ced38e317 ("ARM: ixp4xx: Delete Avila boardfiles") removed the last use of pata_ixp4xx_cf.h but left the header file in place. Nothing uses this file, delete it now. 
Cc: Linus Walleij Cc: Arnd Bergmann Signed-off-by: Jonathan Corbet Acked-by: Arnd Bergmann Signed-off-by: Linus Walleij --- include/linux/platform_data/pata_ixp4xx_cf.h | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 include/linux/platform_data/pata_ixp4xx_cf.h (limited to 'include/linux') diff --git a/include/linux/platform_data/pata_ixp4xx_cf.h b/include/linux/platform_data/pata_ixp4xx_cf.h deleted file mode 100644 index e60fa41da4a5..000000000000 --- a/include/linux/platform_data/pata_ixp4xx_cf.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __PLATFORM_DATA_PATA_IXP4XX_H -#define __PLATFORM_DATA_PATA_IXP4XX_H - -#include - -/* - * This structure provide a means for the board setup code - * to give information to th pata_ixp4xx driver. It is - * passed as platform_data. - */ -struct ixp4xx_pata_data { - volatile u32 *cs0_cfg; - volatile u32 *cs1_cfg; - unsigned long cs0_bits; - unsigned long cs1_bits; - void __iomem *cmd; - void __iomem *ctl; -}; - -#endif -- cgit v1.2.3 From 52f982f00b220d097a71a23c149a1d18efc08e63 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Mon, 6 Dec 2021 14:24:06 +0100 Subject: security,selinux: remove security_add_mnt_opt() Its last user has been removed in commit f2aedb713c28 ("NFS: Add fs_context support."). 
Signed-off-by: Ondrej Mosnacek Reviewed-by: Casey Schaufler Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 -- include/linux/lsm_hooks.h | 2 -- include/linux/security.h | 8 -------- 3 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ae2228f0711d..a5a724c308d8 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -78,8 +78,6 @@ LSM_HOOK(int, 0, sb_set_mnt_opts, struct super_block *sb, void *mnt_opts, LSM_HOOK(int, 0, sb_clone_mnt_opts, const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) -LSM_HOOK(int, 0, sb_add_mnt_opt, const char *option, const char *val, - int len, void **mnt_opts) LSM_HOOK(int, 0, move_mount, const struct path *from_path, const struct path *to_path) LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry, diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 52c1990644b9..3bf5c658bc44 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -180,8 +180,6 @@ * Copy all security options from a given superblock to another * @oldsb old superblock which contain information to clone * @newsb new superblock which needs filled in - * @sb_add_mnt_opt: - * Add one mount @option to @mnt_opts. 
* @sb_parse_opts_str: * Parse a string of security data filling in the opts structure * @options string containing all mount options known by the LSM diff --git a/include/linux/security.h b/include/linux/security.h index bb301963e333..6d72772182c8 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -313,8 +313,6 @@ int security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags); -int security_add_mnt_opt(const char *option, const char *val, - int len, void **mnt_opts); int security_move_mount(const struct path *from_path, const struct path *to_path); int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, @@ -711,12 +709,6 @@ static inline int security_sb_clone_mnt_opts(const struct super_block *oldsb, return 0; } -static inline int security_add_mnt_opt(const char *option, const char *val, - int len, void **mnt_opts) -{ - return 0; -} - static inline int security_move_mount(const struct path *from_path, const struct path *to_path) { -- cgit v1.2.3 From 8ab30a331946c34e4ba022c44df8624acea1c74e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 6 Dec 2021 20:49:48 +0800 Subject: blk-mq: Drop busy_iter_fn blk_mq_hw_ctx argument The only user of blk_mq_hw_ctx blk_mq_hw_ctx argument is blk_mq_rq_inflight(). Function blk_mq_rq_inflight() uses the hctx to find the associated request queue to match against the request. However this same check is already done in caller bt_iter(), so drop this check. With that change there are no more users of busy_iter_fn blk_mq_hw_ctx argument, so drop the argument. 
Reviewed-by: Hannes Reinecke Signed-off-by: John Garry Reviewed-by: Ming Lei Tested-by: Kashyap Desai Link: https://lore.kernel.org/r/1638794990-137490-2-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ecdc049b52fa..17ebf29e42d8 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -470,8 +470,7 @@ struct blk_mq_queue_data { bool last; }; -typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, - bool); +typedef bool (busy_iter_fn)(struct request *, void *, bool); typedef bool (busy_tag_iter_fn)(struct request *, void *, bool); /** -- cgit v1.2.3 From fc39f8d2d1c10ac04976b0a247865bb0cec4dd88 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 6 Dec 2021 20:49:49 +0800 Subject: blk-mq: Delete busy_iter_fn Typedefs busy_iter_fn and busy_tag_iter_fn are now identical, so delete busy_iter_fn to reduce duplication. It would be nicer to delete busy_tag_iter_fn, as the name busy_iter_fn is less specific. However busy_tag_iter_fn is used in many different parts of the tree, unlike busy_iter_fn which is just used in block/, so just take the straightforward path now, so that we could rename later treewide. 
Signed-off-by: John Garry Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Tested-by: Kashyap Desai Link: https://lore.kernel.org/r/1638794990-137490-3-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 17ebf29e42d8..772f8f921526 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -470,7 +470,6 @@ struct blk_mq_queue_data { bool last; }; -typedef bool (busy_iter_fn)(struct request *, void *, bool); typedef bool (busy_tag_iter_fn)(struct request *, void *, bool); /** -- cgit v1.2.3 From 05770dd0ad110854c7157d95700d7c89979cdb3e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 22 Nov 2021 18:30:12 +0900 Subject: tracing: Support __rel_loc relative dynamic data location attribute Add '__rel_loc' new dynamic data location attribute which encodes the data location from the next to the field itself. The '__data_loc' is used for encoding the dynamic data location on the trace event record. But '__data_loc' is not useful if the writer doesn't know the event header (e.g. user event), because it records the dynamic data offset from the entry of the record, not the field itself. This new '__rel_loc' attribute encodes the data location relatively from the next of the field. For example, when there is a record like below (the number in the parentheses is the size of fields) |header(N)|common(M)|fields(K)|__data_loc(4)|fields(L)|data(G)| In this case, '__data_loc' field will be __data_loc = (G << 16) | (N+M+K+4+L) If '__rel_loc' is used, this will be |header(N)|common(M)|fields(K)|__rel_loc(4)|fields(L)|data(G)| where __rel_loc = (G << 16) | (L) This case shows L bytes after the '__rel_loc' attribute field, if there is no fields after the __rel_loc field, L must be 0. 
This is relatively easy (and no need to consider the kernel header change) when the event data fields are composed by a user who doesn't know the header and common fields. Link: https://lkml.kernel.org/r/163757341258.510314.4214431827833229956.stgit@devnote2 Cc: Beau Belgrave Cc: Namhyung Kim Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2d167ac3452c..3900404aa063 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -782,6 +782,7 @@ enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, FILTER_DYN_STRING, + FILTER_RDYN_STRING, FILTER_PTR_STRING, FILTER_TRACE_FN, FILTER_COMM, -- cgit v1.2.3 From 8c33915d77a565b8b5d44e6368e22b6ea300b7a8 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 28 Nov 2021 20:00:29 +0100 Subject: platform/x86: wmi: Add no_notify_data flag to struct wmi_driver Some WMI implementations do notifies on WMI objects without a _WED method. Allow WMI drivers to indicate that _WED should not be called for notifies on the WMI objects the driver is bound to. Instead the driver's notify callback will simply be called with a NULL data argument. 
Reported-by: Yauhen Kharuzhy Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20211128190031.405620-3-hdegoede@redhat.com --- include/linux/wmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 2cb3913c1f50..b88d7b58e61e 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -35,6 +35,7 @@ extern int set_required_buffer_size(struct wmi_device *wdev, u64 length); struct wmi_driver { struct device_driver driver; const struct wmi_device_id *id_table; + bool no_notify_data; int (*probe)(struct wmi_device *wdev, const void *context); void (*remove)(struct wmi_device *wdev); -- cgit v1.2.3 From 4e66934eaadc83b27ada8d42b60894018f3bfabf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:55 -0800 Subject: lib: add reference counting tracking infrastructure It can be hard to track where references are taken and released. In networking, we have annoying issues at device or netns dismantles, and we had various proposals to ease root causing them. This patch adds new infrastructure pairing refcount increases and decreases. This will self-document code, because programmers will have to associate increments/decrements. This is controlled by CONFIG_REF_TRACKER which can be selected by users of this feature. This adds both cpu and memory costs, and thus should probably be used with care. 
Signed-off-by: Eric Dumazet Reviewed-by: Dmitry Vyukov Signed-off-by: Jakub Kicinski --- include/linux/ref_tracker.h | 73 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 include/linux/ref_tracker.h (limited to 'include/linux') diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h new file mode 100644 index 000000000000..c11c9db5825c --- /dev/null +++ b/include/linux/ref_tracker.h @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#ifndef _LINUX_REF_TRACKER_H +#define _LINUX_REF_TRACKER_H +#include +#include +#include + +struct ref_tracker; + +struct ref_tracker_dir { +#ifdef CONFIG_REF_TRACKER + spinlock_t lock; + unsigned int quarantine_avail; + refcount_t untracked; + struct list_head list; /* List of active trackers */ + struct list_head quarantine; /* List of dead trackers */ +#endif +}; + +#ifdef CONFIG_REF_TRACKER +static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, + unsigned int quarantine_count) +{ + INIT_LIST_HEAD(&dir->list); + INIT_LIST_HEAD(&dir->quarantine); + spin_lock_init(&dir->lock); + dir->quarantine_avail = quarantine_count; + refcount_set(&dir->untracked, 1); +} + +void ref_tracker_dir_exit(struct ref_tracker_dir *dir); + +void ref_tracker_dir_print(struct ref_tracker_dir *dir, + unsigned int display_limit); + +int ref_tracker_alloc(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp, gfp_t gfp); + +int ref_tracker_free(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp); + +#else /* CONFIG_REF_TRACKER */ + +static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, + unsigned int quarantine_count) +{ +} + +static inline void ref_tracker_dir_exit(struct ref_tracker_dir *dir) +{ +} + +static inline void ref_tracker_dir_print(struct ref_tracker_dir *dir, + unsigned int display_limit) +{ +} + +static inline int ref_tracker_alloc(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp, + gfp_t gfp) +{ + return 0; +} + 
+static inline int ref_tracker_free(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp) +{ + return 0; +} + +#endif + +#endif /* _LINUX_REF_TRACKER_H */ -- cgit v1.2.3 From 4d92b95ff2f95f13df9bad0b5a25a9f60e72758d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:57 -0800 Subject: net: add net device refcount tracker infrastructure net device are refcounted. Over the years we had numerous bugs caused by imbalanced dev_hold() and dev_put() calls. The general idea is to be able to precisely pair each decrement with a corresponding prior increment. Both share a cookie, basically a pointer to private data storing stack traces. This patch adds dev_hold_track() and dev_put_track(). To use these helpers, each data structure owning a refcount should also use a "netdevice_tracker" to pair the hold and put. netdevice_tracker dev_tracker; ... dev_hold_track(dev, &dev_tracker, GFP_ATOMIC); ... dev_put_track(dev, &dev_tracker); Whenever a leak happens, we will get precise stack traces of the point dev_hold_track() happened, at device dismantle phase. We will also get a stack trace if too many dev_put_track() for the same netdevice_tracker are attempted. This is guarded by CONFIG_NET_DEV_REFCNT_TRACKER option. 
Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 65117f01d5f2..143d60ed0047 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -48,6 +48,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -300,6 +301,12 @@ enum netdev_state_t { }; +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER +typedef struct ref_tracker *netdevice_tracker; +#else +typedef struct {} netdevice_tracker; +#endif + struct gro_list { struct list_head list; int count; @@ -1865,6 +1872,7 @@ enum netdev_ml_priv_type { * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device * @dev_refcnt: Number of references to this device + * @refcnt_tracker: Tracker directory for tracked references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one * @@ -2178,6 +2186,7 @@ struct net_device { #else refcount_t dev_refcnt; #endif + struct ref_tracker_dir refcnt_tracker; struct list_head link_watch_list; @@ -3805,6 +3814,7 @@ void netdev_run_todo(void); * @dev: network device * * Release reference to device to allow it to be freed. + * Try using dev_put_track() instead. */ static inline void dev_put(struct net_device *dev) { @@ -3822,6 +3832,7 @@ static inline void dev_put(struct net_device *dev) * @dev: network device * * Hold reference to device to keep it from being freed. + * Try using dev_hold_track() instead. 
*/ static inline void dev_hold(struct net_device *dev) { @@ -3834,6 +3845,40 @@ static inline void dev_hold(struct net_device *dev) } } +static inline void netdev_tracker_alloc(struct net_device *dev, + netdevice_tracker *tracker, gfp_t gfp) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp); +#endif +} + +static inline void netdev_tracker_free(struct net_device *dev, + netdevice_tracker *tracker) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + ref_tracker_free(&dev->refcnt_tracker, tracker); +#endif +} + +static inline void dev_hold_track(struct net_device *dev, + netdevice_tracker *tracker, gfp_t gfp) +{ + if (dev) { + dev_hold(dev); + netdev_tracker_alloc(dev, tracker, gfp); + } +} + +static inline void dev_put_track(struct net_device *dev, + netdevice_tracker *tracker) +{ + if (dev) { + netdev_tracker_free(dev, tracker); + dev_put(dev); + } +} + /* Carrier loss detection, dial on demand. The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller * who is responsible for serialization of these calls. -- cgit v1.2.3 From 80e8921b2b72c300ca56a01729004d30bedb82cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:58 -0800 Subject: net: add net device refcount tracker to struct netdev_rx_queue This helps debugging net device refcount leaks. 
Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 143d60ed0047..3d691fadd569 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -741,6 +741,8 @@ struct netdev_rx_queue { #endif struct kobject kobj; struct net_device *dev; + netdevice_tracker dev_tracker; + #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif -- cgit v1.2.3 From 0b688f24b7d611db3a02f3d4ab562d049c78a17d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:59 -0800 Subject: net: add net device refcount tracker to struct netdev_queue This will help debugging pesky netdev reference leaks. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3d691fadd569..b4f704337f65 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -586,6 +586,8 @@ struct netdev_queue { * read-mostly part */ struct net_device *dev; + netdevice_tracker dev_tracker; + struct Qdisc __rcu *qdisc; struct Qdisc *qdisc_sleeping; #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 9038c320001dd07f60736018edf608ac5baca0ab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:03 -0800 Subject: net: dst: add net device refcount tracking to dst_entry We want to track all dev_hold()/dev_put() to ease leak hunting. 
Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b4f704337f65..afed3b10491b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3883,6 +3883,23 @@ static inline void dev_put_track(struct net_device *dev, } } +static inline void dev_replace_track(struct net_device *odev, + struct net_device *ndev, + netdevice_tracker *tracker, + gfp_t gfp) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (odev) + ref_tracker_free(&odev->refcnt_tracker, tracker); +#endif + dev_hold(ndev); + dev_put(odev); +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (ndev) + ref_tracker_alloc(&ndev->refcnt_tracker, tracker, gfp); +#endif +} + /* Carrier loss detection, dial on demand. The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller * who is responsible for serialization of these calls. 
-- cgit v1.2.3 From c04438f58d140723e58050fcb9d33d84cb39e9e9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:12 -0800 Subject: ipv4: add net device refcount tracker to struct in_device Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 518b484a7f07..674aeead6260 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -24,6 +24,8 @@ struct ipv4_devconf { struct in_device { struct net_device *dev; + netdevice_tracker dev_tracker; + refcount_t refcnt; int dead; struct in_ifaddr __rcu *ifa_list;/* IP ifaddr chain */ -- cgit v1.2.3 From 63f13937cbe9b00982dfc8e578b1aec8e5037333 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:14 -0800 Subject: net: linkwatch: add net device refcount tracker Add a netdevice_tracker inside struct net_device, to track the self reference when a device is in lweventlist. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index afed3b10491b..69dca1edd5a6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1950,6 +1950,7 @@ enum netdev_ml_priv_type { * keep a list of interfaces to be deleted. * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. + * @linkwatch_dev_tracker: refcount tracker used by linkwatch. * * FIXME: cleanup struct net_device such that network protocol info * moves out. 
@@ -2280,6 +2281,7 @@ struct net_device { struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; u8 dev_addr_shadow[MAX_ADDR_LEN]; + netdevice_tracker linkwatch_dev_tracker; }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From 42120a86438379eb77424831ae3d696c2d5cb622 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:16 -0800 Subject: ipmr, ip6mr: add net device refcount tracker to struct vif_device Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 8071148f29a6..e05ee9f001ff 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -12,6 +12,7 @@ /** * struct vif_device - interface representor for multicast routing * @dev: network device being used + * @dev_tracker: refcount tracker for @dev reference * @bytes_in: statistic; bytes ingressing * @bytes_out: statistic; bytes egresing * @pkt_in: statistic; packets ingressing @@ -26,6 +27,7 @@ */ struct vif_device { struct net_device *dev; + netdevice_tracker dev_tracker; unsigned long bytes_in, bytes_out; unsigned long pkt_in, pkt_out; unsigned long rate_limit; -- cgit v1.2.3 From 5fa5ae605821e0e10ee489d9a6e331fd287ccc57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:17 -0800 Subject: netpoll: add net device refcount tracker to struct netpoll Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index e6a2d72e0dc7..bd19c4b91e31 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -24,6 +24,7 @@ union inet_addr { struct netpoll { struct net_device *dev; + netdevice_tracker dev_tracker; char dev_name[IFNAMSIZ]; const char *name; -- cgit v1.2.3 From 
45cac6754529ae17345d8f5b632d9e602a091a20 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Dec 2021 20:53:56 -0800 Subject: net: fix recent csum changes Vladimir reported csum issues after my recent change in skb_postpull_rcsum() Issue here is the following: initial skb->csum is the csum of [part to be pulled][rest of packet] Old code: skb->csum = csum_sub(skb->csum, csum_partial(pull, pull_length, 0)); New code: skb->csum = ~csum_partial(pull, pull_length, ~skb->csum); This is broken if the csum of [pulled part] happens to be equal to skb->csum, because end result of skb->csum is 0 in new code, instead of being 0xffffffff David Laight suggested to use skb->csum = -csum_partial(pull, pull_length, -skb->csum); I based my patches on existing code present in include/net/seg6.h, update_csum_diff4() and update_csum_diff16() which might need a similar fix. I guess that my tests, mostly pulling 40 bytes of IPv6 header were not providing enough entropy to hit this bug. v2: added wsum_negate() to make sparse happy. 
Fixes: 29c3002644bd ("net: optimize skb_postpull_rcsum()") Fixes: 0bd28476f636 ("gro: optimize skb_gro_postpull_rcsum()") Signed-off-by: Eric Dumazet Reported-by: Vladimir Oltean Suggested-by: David Laight Cc: David Lebrun Tested-by: Vladimir Oltean Link: https://lore.kernel.org/r/20211204045356.3659278-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index eae4bd3237a4..dd262bd8ddbe 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3486,7 +3486,8 @@ static inline void skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = ~csum_partial(start, len, ~skb->csum); + skb->csum = wsum_negate(csum_partial(start, len, + wsum_negate(skb->csum))); else if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; -- cgit v1.2.3 From f0e6e6fa41b3d2aa1dcb61dd4ed6d7be004bb5a8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 18 Oct 2021 17:14:07 +0200 Subject: KVM: Drop stale kvm_is_transparent_hugepage() declaration kvm_is_transparent_hugepage() was removed in commit 205d76ff0684 ("KVM: Remove kvm_is_transparent_hugepage() and PageTransCompoundMap()") but its declaration in include/linux/kvm_host.h persisted. Drop it. 
Fixes: 205d76ff0684 ("KVM: Remove kvm_is_transparent_hugepage() and PageTransCompoundMap()") Signed-off-by: Vitaly Kuznetsov Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20211018151407.2107363-1-vkuznets@redhat.com --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c310648cc8f1..6d138adc78af 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1174,7 +1174,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); bool kvm_is_reserved_pfn(kvm_pfn_t pfn); bool kvm_is_zone_device_pfn(kvm_pfn_t pfn); -bool kvm_is_transparent_hugepage(kvm_pfn_t pfn); struct kvm_irq_ack_notifier { struct hlist_node link; -- cgit v1.2.3 From 77993b595ada5731e513eb06a0f4bf4b9f1e9532 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 29 Nov 2021 18:46:54 +0100 Subject: locking: Allow to include asm/spinlock_types.h from linux/spinlock_types_raw.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The printk header file includes ratelimit_types.h for its __ratelimit() based usage. It is required for the static initializer used in printk_ratelimited(). It uses a raw_spinlock_t and includes the spinlock_types.h. PREEMPT_RT substitutes spinlock_t with a rtmutex based implementation and so its spinlock_t implementation (provided by spinlock_rt.h) includes rtmutex.h and atomic.h which leads to recursive includes where defines are missing. By including only the raw_spinlock_t defines it avoids the atomic.h related includes at this stage.
An example on powerpc: | CALL scripts/atomic/check-atomics.sh |In file included from include/linux/bug.h:5, | from include/linux/page-flags.h:10, | from kernel/bounds.c:10: |arch/powerpc/include/asm/page_32.h: In function ‘clear_page’: |arch/powerpc/include/asm/bug.h:87:4: error: implicit declaration of function ‘__WARN’ [-Werror=implicit-function-declaration] | 87 | __WARN(); \ | | ^~~~~~ |arch/powerpc/include/asm/page_32.h:48:2: note: in expansion of macro ‘WARN_ON’ | 48 | WARN_ON((unsigned long)addr & (L1_CACHE_BYTES - 1)); | | ^~~~~~~ |arch/powerpc/include/asm/bug.h:58:17: error: invalid application of ‘sizeof’ to incomplete type ‘struct bug_entry’ | 58 | "i" (sizeof(struct bug_entry)), \ | | ^~~~~~ |arch/powerpc/include/asm/bug.h:89:3: note: in expansion of macro ‘BUG_ENTRY’ | 89 | BUG_ENTRY(PPC_TLNEI " %4, 0", \ | | ^~~~~~~~~ |arch/powerpc/include/asm/page_32.h:48:2: note: in expansion of macro ‘WARN_ON’ | 48 | WARN_ON((unsigned long)addr & (L1_CACHE_BYTES - 1)); | | ^~~~~~~ |In file included from arch/powerpc/include/asm/ptrace.h:298, | from arch/powerpc/include/asm/hw_irq.h:12, | from arch/powerpc/include/asm/irqflags.h:12, | from include/linux/irqflags.h:16, | from include/asm-generic/cmpxchg-local.h:6, | from arch/powerpc/include/asm/cmpxchg.h:526, | from arch/powerpc/include/asm/atomic.h:11, | from include/linux/atomic.h:7, | from include/linux/rwbase_rt.h:6, | from include/linux/rwlock_types.h:55, | from include/linux/spinlock_types.h:74, | from include/linux/ratelimit_types.h:7, | from include/linux/printk.h:10, | from include/asm-generic/bug.h:22, | from arch/powerpc/include/asm/bug.h:109, | from include/linux/bug.h:5, | from include/linux/page-flags.h:10, | from kernel/bounds.c:10: |include/linux/thread_info.h: In function ‘copy_overflow’: |include/linux/thread_info.h:210:2: error: implicit declaration of function ‘WARN’ [-Werror=implicit-function-declaration] | 210 | WARN(1, "Buffer
overflow detected (%d < %lu)!\n", size, count); | | ^~~~ The WARN / BUG include pulls in printk.h and then ptrace.h expects WARN (from bug.h) which is not yet complete. Even hw_irq.h has WARN_ON() statements. On POWERPC64 there are missing atomic64 defines while building 32bit VDSO: | VDSO32C arch/powerpc/kernel/vdso32/vgettimeofday.o |In file included from include/linux/atomic.h:80, | from include/linux/rwbase_rt.h:6, | from include/linux/rwlock_types.h:55, | from include/linux/spinlock_types.h:74, | from include/linux/ratelimit_types.h:7, | from include/linux/printk.h:10, | from include/linux/kernel.h:19, | from arch/powerpc/include/asm/page.h:11, | from arch/powerpc/include/asm/vdso/gettimeofday.h:5, | from include/vdso/datapage.h:137, | from lib/vdso/gettimeofday.c:5, | from <command-line>: |include/linux/atomic-arch-fallback.h: In function ‘arch_atomic64_inc’: |include/linux/atomic-arch-fallback.h:1447:2: error: implicit declaration of function ‘arch_atomic64_add’; did you mean ‘arch_atomic_add’? [-Werror=implicit-function-declaration] | 1447 | arch_atomic64_add(1, v); | | ^~~~~~~~~~~~~~~~~ | | arch_atomic_add The generic fallback is not included, atomics itself are not used. If kernel.h does not include printk.h then it comes later from the bug.h include. Allow asm/spinlock_types.h to be included from linux/spinlock_types_raw.h.
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211129174654.668506-12-bigeasy@linutronix.de --- include/linux/ratelimit_types.h | 2 +- include/linux/spinlock_types_up.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h index b676aa419eef..c21c7f8103e2 100644 --- a/include/linux/ratelimit_types.h +++ b/include/linux/ratelimit_types.h @@ -4,7 +4,7 @@ #include #include -#include +#include #define DEFAULT_RATELIMIT_INTERVAL (5 * HZ) #define DEFAULT_RATELIMIT_BURST 10 diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h index c09b6407ae1b..7f86a2016ac5 100644 --- a/include/linux/spinlock_types_up.h +++ b/include/linux/spinlock_types_up.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_TYPES_UP_H #define __LINUX_SPINLOCK_TYPES_UP_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif -- cgit v1.2.3 From 8d9f738f16a3ee9f2341578873c542ddd9802fe4 Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Tue, 7 Dec 2021 20:32:30 +0800 Subject: regulator: fix bullet lists of regulator_ops comment Since 89a6a5e56c82("regulator: add property parsing and callbacks to set protection limits") which introduced a warning: Documentation/driver-api/regulator:166: ./include/linux/regulator/driver.h:96: WARNING: Unexpected indentation. Documentation/driver-api/regulator:166: ./include/linux/regulator/driver.h:98: WARNING: Block quote ends without a blank line; unexpected unindent. Let's fix them. 
Signed-off-by: Yanteng Si Link: https://lore.kernel.org/r/20211207123230.2262047-1-siyanteng@loongson.cn Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 4078c7776453..720684995a77 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -90,15 +90,19 @@ enum regulator_detection_severity { * @set_over_current_protection: Support enabling of and setting limits for over * current situation detection. Detection can be configured for three * levels of severity. - * REGULATOR_SEVERITY_PROT should automatically shut down the regulator(s). - * REGULATOR_SEVERITY_ERR should indicate that over-current situation is - * caused by an unrecoverable error but HW does not perform - * automatic shut down. - * REGULATOR_SEVERITY_WARN should indicate situation where hardware is - * still believed to not be damaged but that a board sepcific - * recovery action is needed. If lim_uA is 0 the limit should not - * be changed but the detection should just be enabled/disabled as - * is requested. + * + * - REGULATOR_SEVERITY_PROT should automatically shut down the regulator(s). + * + * - REGULATOR_SEVERITY_ERR should indicate that over-current situation is + * caused by an unrecoverable error but HW does not perform + * automatic shut down. + * + * - REGULATOR_SEVERITY_WARN should indicate situation where hardware is + * still believed to not be damaged but that a board sepcific + * recovery action is needed. If lim_uA is 0 the limit should not + * be changed but the detection should just be enabled/disabled as + * is requested. + * * @set_over_voltage_protection: Support enabling of and setting limits for over * voltage situation detection. Detection can be configured for same * severities as over current protection. Units of uV. 
-- cgit v1.2.3 From 13244cccc2b61ec715f0ac583d3037497004d4a5 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 1 Dec 2021 10:54:52 -0800 Subject: skbuff: introduce skb_pull_data Like skb_pull but returns the original data pointer before pulling the data after performing a check against skb->len. This allows to change code that does "struct foo *p = (void *)skb->data;" which is hard to audit and error prone, to: p = skb_pull_data(skb, sizeof(*p)); if (!p) return; Which is both safer and cleaner. Acked-by: Jakub Kicinski Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Dan Carpenter Signed-off-by: Marcel Holtmann --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index eba256af64a5..877dda38684a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2373,6 +2373,8 @@ static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len) return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); } +void *skb_pull_data(struct sk_buff *skb, size_t len); + void *__pskb_pull_tail(struct sk_buff *skb, int delta); static inline void *__pskb_pull(struct sk_buff *skb, unsigned int len) -- cgit v1.2.3 From 6fadaa565882cd7afc501de5921db6f5e45c784b Mon Sep 17 00:00:00 2001 From: Maxim Galaganov Date: Fri, 3 Dec 2021 14:35:39 -0800 Subject: tcp: expose __tcp_sock_set_cork and __tcp_sock_set_nodelay Expose __tcp_sock_set_cork() and __tcp_sock_set_nodelay() for use in MPTCP setsockopt code -- namely for syncing MPTCP socket options with subflows inside sync_socket_options() while already holding the subflow socket lock.
Acked-by: Paolo Abeni Acked-by: Matthieu Baerts Signed-off-by: Maxim Galaganov Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 48d8a363319e..78b91bb92f0d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -512,11 +512,13 @@ static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); +void __tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle_locked(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); +void __tcp_sock_set_nodelay(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); -- cgit v1.2.3 From 213d56bf33bdda835bac04046f09256a75c5ca8e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Oct 2021 02:08:07 +0200 Subject: rcu/nocb: Prepare state machine for a new step Currently SEGCBLIST_SOFTIRQ_ONLY is a bit of an exception among the segcblist flags because it is an exclusive state that doesn't mix up with the other flags. Remove it in favour of: _ A flag specifying that rcu_core() needs to perform callbacks execution and acceleration and _ A flag specifying we want the nocb lock to be held in any needed circumstances This clarifies the code and is more flexible: It allows to have a state where rcu_core() runs with locking while offloading hasn't started yet. 
This is a necessary step to prepare for triggering rcu_core() at the very beginning of the de-offloading process so that rcu_core() won't dismiss work while being preempted by the de-offloading process, at least not without a pending subsequent rcu_core() that will quickly catch up. Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 3db96c4f45fd..812961b1d064 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -69,7 +69,7 @@ struct rcu_cblist { * * * ---------------------------------------------------------------------------- - * | SEGCBLIST_SOFTIRQ_ONLY | + * | SEGCBLIST_RCU_CORE | * | | * | Callbacks processed by rcu_core() from softirqs or local | * | rcuc kthread, without holding nocb_lock. | @@ -77,7 +77,7 @@ struct rcu_cblist { * | * v * ---------------------------------------------------------------------------- - * | SEGCBLIST_OFFLOADED | + * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED | * | | * | Callbacks processed by rcu_core() from softirqs or local | * | rcuc kthread, while holding nocb_lock. 
Waking up CB and GP kthreads, | @@ -89,7 +89,9 @@ struct rcu_cblist { * | | * v v * --------------------------------------- ----------------------------------| - * | SEGCBLIST_OFFLOADED | | | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_RCU_CORE | | | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | | SEGCBLIST_OFFLOADED | | * | SEGCBLIST_KTHREAD_CB | | SEGCBLIST_KTHREAD_GP | * | | | | * | | | | @@ -104,9 +106,10 @@ struct rcu_cblist { * | * v * |--------------------------------------------------------------------------| - * | SEGCBLIST_OFFLOADED | | - * | SEGCBLIST_KTHREAD_CB | | - * | SEGCBLIST_KTHREAD_GP | + * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_GP | | + * | SEGCBLIST_KTHREAD_CB | * | | * | Kthreads handle callbacks holding nocb_lock, local rcu_core() stops | * | handling callbacks. Enable bypass queueing. | @@ -120,7 +123,8 @@ struct rcu_cblist { * * * |--------------------------------------------------------------------------| - * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | * | | @@ -130,6 +134,8 @@ struct rcu_cblist { * | * v * |--------------------------------------------------------------------------| + * | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | * | | @@ -143,7 +149,9 @@ struct rcu_cblist { * | | * v v * ---------------------------------------------------------------------------| - * | | + * | | | + * | SEGCBLIST_RCU_CORE | | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | SEGCBLIST_LOCKING | | * | SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | * | | | * | GP kthread woke up and | CB kthread woke up and | @@ -159,7 +167,7 @@ struct rcu_cblist { * | * v * ---------------------------------------------------------------------------- - * | 0 | + * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | * | | * | 
Callbacks processed by rcu_core() from softirqs or local | * | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed. | @@ -168,17 +176,18 @@ struct rcu_cblist { * | * v * ---------------------------------------------------------------------------- - * | SEGCBLIST_SOFTIRQ_ONLY | + * | SEGCBLIST_RCU_CORE | * | | * | Callbacks processed by rcu_core() from softirqs or local | * | rcuc kthread, without holding nocb_lock. | * ---------------------------------------------------------------------------- */ #define SEGCBLIST_ENABLED BIT(0) -#define SEGCBLIST_SOFTIRQ_ONLY BIT(1) -#define SEGCBLIST_KTHREAD_CB BIT(2) -#define SEGCBLIST_KTHREAD_GP BIT(3) -#define SEGCBLIST_OFFLOADED BIT(4) +#define SEGCBLIST_RCU_CORE BIT(1) +#define SEGCBLIST_LOCKING BIT(2) +#define SEGCBLIST_KTHREAD_CB BIT(3) +#define SEGCBLIST_KTHREAD_GP BIT(4) +#define SEGCBLIST_OFFLOADED BIT(5) struct rcu_segcblist { struct rcu_head *head; -- cgit v1.2.3 From fbb94cbd70d41c7511460896dfc7f9ea5da704b3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Oct 2021 02:08:08 +0200 Subject: rcu/nocb: Invoke rcu_core() at the start of deoffloading On PREEMPT_RT, if rcu_core() is preempted by the de-offloading process, some work, such as callbacks acceleration and invocation, may be left unattended due to the volatile checks on the offloaded state. In the worst case this work is postponed until the next rcu_pending() check that can take a jiffy to reach, which can be a problem in case of callbacks flooding. Solve that with invoking rcu_core() early in the de-offloading process. This way any work dismissed by an ongoing rcu_core() call fooled by a preempting deoffloading process will be caught up by a nearby future recall to rcu_core(), this time fully aware of the de-offloading state. 
Tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Frederic Weisbecker Cc: Valentin Schneider Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 812961b1d064..659d13a7ddaa 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -136,6 +136,20 @@ struct rcu_cblist { * |--------------------------------------------------------------------------| * | SEGCBLIST_RCU_CORE | | * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_CB | | + * | SEGCBLIST_KTHREAD_GP | + * | | + * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | + * | handles callbacks concurrently. Bypass enqueue is enabled. | + * | Invoke RCU core so we make sure not to preempt it in the middle with | + * | leaving some urgent work unattended within a jiffy. | + * ---------------------------------------------------------------------------- + * | + * v + * |--------------------------------------------------------------------------| + * | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | * | | -- cgit v1.2.3 From 81faa4f6fba429334ff72bb5ba7696818509b5b5 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 3 Nov 2021 16:30:28 +0800 Subject: locktorture,rcutorture,torture: Always log error message Unconditionally log messages corresponding to errors. Acked-by: Davidlohr Bueso Signed-off-by: Li Zhijian Signed-off-by: Paul E. 
McKenney --- include/linux/torture.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 24f58e50a94b..63fa4196e51c 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -38,13 +38,8 @@ do { \ pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); \ } \ } while (0) -#define VERBOSE_TOROUT_ERRSTRING(s) \ -do { \ - if (verbose) { \ - verbose_torout_sleep(); \ - pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); \ - } \ -} while (0) +#define TOROUT_ERRSTRING(s) \ + pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s) void verbose_torout_sleep(void); #define torture_init_error(firsterr) \ -- cgit v1.2.3 From 08f0b22d731fa86957749c649d6ef6ebc07e8ad2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Dec 2021 17:30:27 -0800 Subject: net: eql: add net device refcount tracker Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/if_eql.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_eql.h b/include/linux/if_eql.h index d984694c384d..d75601d613cc 100644 --- a/include/linux/if_eql.h +++ b/include/linux/if_eql.h @@ -26,6 +26,7 @@ typedef struct slave { struct list_head list; struct net_device *dev; + netdevice_tracker dev_tracker; long priority; long priority_bps; long priority_Bps; -- cgit v1.2.3 From 19c9ebf6ed70856385296a65e78c1699081b152f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Dec 2021 17:30:28 -0800 Subject: vlan: add net device refcount tracker Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 41a518336673..8420fe504927 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -162,6 +162,7 @@ struct netpoll; * @vlan_id: VLAN identifier * @flags: device flags * 
@real_dev: underlying netdevice + * @dev_tracker: refcount tracker for @real_dev reference * @real_dev_addr: address of underlying netdevice * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats @@ -177,6 +178,8 @@ struct vlan_dev_priv { u16 flags; struct net_device *real_dev; + netdevice_tracker dev_tracker; + unsigned char real_dev_addr[ETH_ALEN]; struct proc_dir_entry *dent; -- cgit v1.2.3 From f12bf6f3f942b37de65eeea8be25903587fec930 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Dec 2021 17:30:30 -0800 Subject: net: watchdog: add net device refcount tracker Add a netdevice_tracker inside struct net_device, to track the self reference when a device has an active watchdog timer. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69dca1edd5a6..1a748ee9a421 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1951,6 +1951,7 @@ enum netdev_ml_priv_type { * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. + * @watchdog_dev_tracker: refcount tracker used by watchdog. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2282,6 +2283,7 @@ struct net_device { u8 dev_addr_shadow[MAX_ADDR_LEN]; netdevice_tracker linkwatch_dev_tracker; + netdevice_tracker watchdog_dev_tracker; }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From a97770cc4016c2733bcef9dbe3d5b1ad02d13356 Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Mon, 6 Dec 2021 16:12:27 +0800 Subject: net: phy: Remove unnecessary indentation in the comments of phy_device Fix warning as: linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:543: WARNING: Unexpected indentation. 
linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:544: WARNING: Block quote ends without a blank line; unexpected unindent. linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:546: WARNING: Unexpected indentation. Suggested-by: Akira Yokosawa Signed-off-by: Yanteng Si Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 96e43fbb2dd8..cbf03a5f9cf5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -538,11 +538,12 @@ struct macsec_ops; * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY * @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. - * Bits [15:0] are free to use by the PHY driver to communicate - * driver specific behavior. - * Bits [23:16] are currently reserved for future use. - * Bits [31:24] are reserved for defining generic - * PHY driver behavior. + * + * - Bits [15:0] are free to use by the PHY driver to communicate + * driver specific behavior. + * - Bits [23:16] are currently reserved for future use. + * - Bits [31:24] are reserved for defining generic + * PHY driver behavior. * @irq: IRQ number of the PHY's interrupt (-1 if none) * @phy_timer: The timer for handling the state machine * @phylink: Pointer to phylink instance for this PHY -- cgit v1.2.3 From 330c6d3bfa268794bf692165d0f781f1c2d4d83e Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 24 Nov 2021 10:45:36 +0900 Subject: can: bittiming: replace CAN units with the generic ones from linux/units.h In [1], we introduced a set of units in linux/can/bittiming.h. Since then, generic SI prefixes were added to linux/units.h in [2]. Those new prefixes can perfectly replace CAN specific ones. 
This patch replaces all occurrences of the CAN units with their corresponding prefix (from linux/units) and the unit (as a comment) according to the table below. CAN units SI metric prefix (from linux/units) + unit (as a comment) ------------------------------------------------------------------------ CAN_KBPS KILO /* BPS */ CAN_MBPS MEGA /* BPS */ CAN_MHZ MEGA /* Hz */ The definitions are then removed from linux/can/bittiming.h [1] commit 1d7750760b70 ("can: bittiming: add CAN_KBPS, CAN_MBPS and CAN_MHZ macros") [2] commit 26471d4a6cf8 ("units: Add SI metric prefix definitions") Link: https://lore.kernel.org/all/20211124014536.782550-1-mailhol.vincent@wanadoo.fr Suggested-by: Jimmy Assarsson Suggested-by: Oliver Hartkopp Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 20b50baf3a02..a81652d1c6f3 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -12,13 +12,6 @@ #define CAN_SYNC_SEG 1 -/* Kilobits and Megabits per second */ -#define CAN_KBPS 1000UL -#define CAN_MBPS 1000000UL - -/* Megahertz */ -#define CAN_MHZ 1000000UL - #define CAN_CTRLMODE_TDC_MASK \ (CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_TDC_MANUAL) -- cgit v1.2.3 From 27592ae8dbe41033261b6fdf27d78998aabd2665 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Nov 2021 16:03:57 +0000 Subject: KVM: Move wiping of the kvm->vcpus array to common code All architectures have similar loops iterating over the vcpus, freeing one vcpu at a time, and eventually wiping the reference off the vcpus array. They are also inconsistently taking the kvm->lock mutex when wiping the references from the array. Make this code common, which will simplify further changes. The locking is dropped altogether, as this should only be called when there are no further references on the kvm structure.
Reviewed-by: Claudio Imbrenda Signed-off-by: Marc Zyngier Message-Id: <20211116160403.4074052-2-maz@kernel.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c310648cc8f1..e2f9f8f67c58 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -733,7 +733,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) if (WARN_ON_ONCE(!memslot->npages)) { \ } else -void kvm_vcpu_destroy(struct kvm_vcpu *vcpu); +void kvm_destroy_vcpus(struct kvm *kvm); void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From c5b077549136584618a66258f09d8d4b41e7409c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Nov 2021 16:04:01 +0000 Subject: KVM: Convert the kvm->vcpus array to a xarray At least on arm64 and x86, the vcpus array is pretty huge (up to 1024 entries on x86) and is mostly empty in the majority of the cases (running 1k vcpu VMs is not that common). This means that we end up with a 4kB block of unused memory in the middle of the kvm structure. Instead of wasting away this memory, let's use an xarray instead, which gives us almost the same flexibility as a normal array, but with a reduced memory usage with smaller VMs.
Signed-off-by: Marc Zyngier Message-Id: <20211116160403.4074052-6-maz@kernel.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e2f9f8f67c58..2201dc07126a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -552,7 +553,7 @@ struct kvm { struct mutex slots_arch_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; - struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + struct xarray vcpu_array; /* Used to wait for completion of MMU notifiers. */ spinlock_t mn_invalidate_lock; @@ -701,7 +702,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu. */ smp_rmb(); - return kvm->vcpus[i]; + return xa_load(&kvm->vcpu_array, i); } #define kvm_for_each_vcpu(idx, vcpup, kvm) \ -- cgit v1.2.3 From 46808a4cb89708c2e5b264eb9d1035762581921b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Nov 2021 16:04:02 +0000 Subject: KVM: Use 'unsigned long' as kvm_for_each_vcpu()'s index Everywhere we use kvm_for_each_vcpu(), we use an int as the vcpu index. Unfortunately, we're about to rework the iterator, which requires this to be upgraded to an unsigned long. Let's bite the bullet and repaint all of it in one go.
Signed-off-by: Marc Zyngier Message-Id: <20211116160403.4074052-7-maz@kernel.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2201dc07126a..7da6086262c6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -714,7 +714,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) { struct kvm_vcpu *vcpu = NULL; - int i; + unsigned long i; if (id < 0) return NULL; -- cgit v1.2.3 From 214bd3a6f46981b7867946e1b4f628a06bcf2091 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Nov 2021 16:04:03 +0000 Subject: KVM: Convert kvm_for_each_vcpu() to using xa_for_each_range() Now that the vcpu array is backed by an xarray, use the optimised iterator that matches the underlying data structure. Suggested-by: Sean Christopherson Signed-off-by: Marc Zyngier Message-Id: <20211116160403.4074052-8-maz@kernel.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7da6086262c6..66548287ed42 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -705,11 +705,9 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) return xa_load(&kvm->vcpu_array, i); } -#define kvm_for_each_vcpu(idx, vcpup, kvm) \ - for (idx = 0; \ - idx < atomic_read(&kvm->online_vcpus) && \ - (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \ - idx++) +#define kvm_for_each_vcpu(idx, vcpup, kvm) \ + xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0, \ + (atomic_read(&kvm->online_vcpus) - 1)) static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) { -- cgit v1.2.3 From afa319a54a8c760ba59683cd3c4318635049a664 Mon Sep 17 00:00:00 2001 
From: Sean Christopherson Date: Mon, 6 Dec 2021 20:54:07 +0100 Subject: KVM: Require total number of memslot pages to fit in an unsigned long Explicitly disallow creating more memslot pages than can fit in an unsigned long; KVM doesn't correctly handle a total number of memslot pages that doesn't fit in an unsigned long and remedying that would be a waste of time. For a 64-bit kernel, this is a nop as memslots are not allowed to overlap in the gfn address space. With a 32-bit kernel, userspace can at most address 3gb of virtual memory, whereas wrapping the total number of pages would require 4tb+ of guest physical memory. Even with x86's second address space for SMM, userspace would need to alias all of guest memory more than one _thousand_ times. And on older x86 hardware with MAXPHYADDR < 43, the guest couldn't actually access any of those aliases even if userspace lied about guest.MAXPHYADDR. On s390 and arm64, this is a nop as they don't support 32-bit hosts. On x86, practically speaking this is simply acknowledging reality as the existing kvm_mmu_calculate_default_mmu_pages() assumes the total number of pages fits in an "unsigned long". On PPC, this is likely a nop as every flavor of PPC KVM assumes gfns (and gpas!) fit in unsigned long. arch/powerpc/kvm/book3s_32_mmu_host.c goes a step further and fails the build if CONFIG_PTE_64BIT=y, which presumably means that it doesn't support 64-bit physical addresses. On MIPS, this is also likely a nop as the core MMU helpers assume gpas fit in unsigned long, e.g. see kvm_mips_##name##_pte. And finally, RISC-V is a "don't care" as it doesn't exist in any release, i.e. there is no established ABI to break. Signed-off-by: Sean Christopherson Reviewed-by: Maciej S. Szmigiero Signed-off-by: Maciej S.
Szmigiero Message-Id: <1c2c91baf8e78acccd4dad38da591002e61c013c.1638817638.git.maciej.szmigiero@oracle.com> --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 66548287ed42..e38705359af5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -552,6 +552,7 @@ struct kvm { */ struct mutex slots_arch_lock; struct mm_struct *mm; /* userspace tied to this vm */ + unsigned long nr_memslot_pages; struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct xarray vcpu_array; -- cgit v1.2.3 From 537a17b3149300987456e8949ccb991e604047d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 6 Dec 2021 20:54:11 +0100 Subject: KVM: Let/force architectures to deal with arch specific memslot data Pass the "old" slot to kvm_arch_prepare_memory_region() and force arch code to handle propagating arch specific data from "new" to "old" when necessary. This is a baby step towards dynamically allocating "new" from the get go, and is a (very) minor performance boost on x86 due to not unnecessarily copying arch data. For PPC HV, copy the rmap in the !CREATE and !DELETE paths, i.e. for MOVE and FLAGS_ONLY. This is functionally a nop as the previous behavior would overwrite the pointer for CREATE, and eventually discard/ignore it for DELETE. For x86, copy the arch data only for FLAGS_ONLY changes. Unlike PPC HV, x86 needs to reallocate arch data in the MOVE case as the size of x86's allocations depend on the alignment of the memslot's gfn. Opportunistically tweak kvm_arch_prepare_memory_region()'s param order to match the "commit" prototype. Signed-off-by: Sean Christopherson Reviewed-by: Maciej S. Szmigiero [mss: add missing RISCV kvm_arch_prepare_memory_region() change] Signed-off-by: Maciej S. 
Szmigiero Message-Id: <67dea5f11bbcfd71e3da5986f11e87f5dd4013f9.1638817639.git.maciej.szmigiero@oracle.com> --- include/linux/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e38705359af5..cb7311dc6f32 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -833,8 +833,9 @@ int __kvm_set_memory_region(struct kvm *kvm, void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, enum kvm_mr_change change); void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, -- cgit v1.2.3 From 6a99c6e3f52a6f0d4c6ebcfa7359c718a19ffbe6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 6 Dec 2021 20:54:18 +0100 Subject: KVM: Stop passing kvm_userspace_memory_region to arch memslot hooks Drop the @mem param from kvm_arch_{prepare,commit}_memory_region() now that its use has been removed in all architectures. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maciej S. Szmigiero Signed-off-by: Maciej S. 
Szmigiero Message-Id: --- include/linux/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cb7311dc6f32..da0d4f21a150 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -833,12 +833,10 @@ int __kvm_set_memory_region(struct kvm *kvm, void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change); void kvm_arch_commit_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change); -- cgit v1.2.3 From c928bfc2632fa3dd6a3bd4504ac6d8e42302287a Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 6 Dec 2021 20:54:25 +0100 Subject: KVM: Integrate gfn_to_memslot_approx() into search_memslots() s390 arch has gfn_to_memslot_approx() which is almost identical to search_memslots(), differing only in that in case the gfn falls in a hole one of the memslots bordering the hole is returned. Add this lookup mode as an option to search_memslots() so we don't have two almost identical functions for looking up a memslot by its gfn. Signed-off-by: Maciej S. 
Szmigiero [sean: tweaked helper names to keep gfn_to_memslot_approx() in s390] Reviewed-by: Sean Christopherson Message-Id: <171cd89b52c718dbe180ecd909b4437a64a7e2ec.1638817640.git.maciej.szmigiero@oracle.com> --- include/linux/kvm_host.h | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index da0d4f21a150..2f80ce84fbcf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1233,10 +1233,14 @@ try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn) * Returns a pointer to the memslot that contains gfn and records the index of * the slot in index. Otherwise returns NULL. * + * With "approx" set returns the memslot also when the address falls + * in a hole. In that case one of the memslots bordering the hole is + * returned. + * * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! */ static inline struct kvm_memory_slot * -search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index) +search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index, bool approx) { int start = 0, end = slots->used_slots; struct kvm_memory_slot *memslots = slots->memslots; @@ -1254,22 +1258,26 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index) start = slot + 1; } + if (approx && start >= slots->used_slots) { + *index = slots->used_slots - 1; + return &memslots[slots->used_slots - 1]; + } + slot = try_get_memslot(slots, start, gfn); if (slot) { *index = start; return slot; } + if (approx) { + *index = start; + return &memslots[start]; + } return NULL; } -/* - * __gfn_to_memslot() and its descendants are here because it is called from - * non-modular code in arch/powerpc/kvm/book3s_64_vio{,_hv}.c. gfn_to_memslot() - * itself isn't here as an inline because that would bloat other code too much. 
- */ static inline struct kvm_memory_slot * -__gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) +____gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn, bool approx) { struct kvm_memory_slot *slot; int slot_index = atomic_read(&slots->last_used_slot); @@ -1278,7 +1286,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) if (slot) return slot; - slot = search_memslots(slots, gfn, &slot_index); + slot = search_memslots(slots, gfn, &slot_index, approx); if (slot) { atomic_set(&slots->last_used_slot, slot_index); return slot; @@ -1287,6 +1295,17 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) return NULL; } +/* + * __gfn_to_memslot() and its descendants are here to allow arch code to inline + * the lookups in hot paths. gfn_to_memslot() itself isn't here as an inline + * because that would bloat other code too much. + */ +static inline struct kvm_memory_slot * +__gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) +{ + return ____gfn_to_memslot(slots, gfn, false); +} + static inline unsigned long __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { -- cgit v1.2.3 From 26b8345abc75a7404716864710930407b7d873f9 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 6 Dec 2021 20:54:27 +0100 Subject: KVM: Resolve memslot ID via a hash table instead of via a static array Memslot ID to the corresponding memslot mappings are currently kept as indices in static id_to_index array. The size of this array depends on the maximum allowed memslot count (regardless of the number of memslots actually in use). This has become especially problematic recently, when memslot count cap was removed, so the maximum count is now full 32k memslots - the maximum allowed by the current KVM API. Keeping these IDs in a hash table (instead of an array) avoids this problem. 
Resolving a memslot ID to the actual memslot (instead of its index) will also enable transitioning away from an array-based implementation of the whole memslots structure in a later commit. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Maciej S. Szmigiero Message-Id: <117fb2c04320e6cd6cf34f205a72eadb0aa8d5f9.1638817640.git.maciej.szmigiero@oracle.com> --- include/linux/kvm_host.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2f80ce84fbcf..79db70a8323e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -426,6 +427,7 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) struct kvm_memory_slot { + struct hlist_node id_node; gfn_t base_gfn; unsigned long npages; unsigned long *dirty_bitmap; @@ -527,8 +529,15 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) */ struct kvm_memslots { u64 generation; - /* The mapping table from slot id to the index in memslots[]. */ - short id_to_index[KVM_MEM_SLOTS_NUM]; + /* + * The mapping table from slot id to the index in memslots[]. + * + * 7-bit bucket count matches the size of the old id to index array for + * 512 slots, while giving good performance with this slot count. + * Higher bucket counts bring only small performance improvements but + * always result in higher memory usage (even for lower memslot counts). 
+ */ + DECLARE_HASHTABLE(id_hash, 7); atomic_t last_used_slot; int used_slots; struct kvm_memory_slot memslots[]; @@ -796,16 +805,14 @@ static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu) static inline struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id) { - int index = slots->id_to_index[id]; struct kvm_memory_slot *slot; - if (index < 0) - return NULL; - - slot = &slots->memslots[index]; + hash_for_each_possible(slots->id_hash, slot, id_node, id) { + if (slot->id == id) + return slot; + } - WARN_ON(slot->id != id); - return slot; + return NULL; } /* -- cgit v1.2.3 From ed922739c9199bf515a3e7fec3e319ce1edeef2a Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 6 Dec 2021 20:54:28 +0100 Subject: KVM: Use interval tree to do fast hva lookup in memslots The current memslots implementation only allows quick binary search by gfn, quick lookup by hva is not possible - the implementation has to do a linear scan of the whole memslots array, even though the operation being performed might apply just to a single memslot. This significantly hurts performance of per-hva operations with higher memslot counts. Since hva ranges can overlap between memslots an interval tree is needed for tracking them. [sean: handle interval tree updates in kvm_replace_memslot()] Signed-off-by: Maciej S. 
Szmigiero Message-Id: --- include/linux/kvm_host.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 79db70a8323e..9552ad6d6652 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -428,6 +429,7 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) struct kvm_memory_slot { struct hlist_node id_node; + struct interval_tree_node hva_node; gfn_t base_gfn; unsigned long npages; unsigned long *dirty_bitmap; @@ -529,6 +531,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) */ struct kvm_memslots { u64 generation; + struct rb_root_cached hva_tree; /* * The mapping table from slot id to the index in memslots[]. * -- cgit v1.2.3 From a54d806688fe1e482350ce759a8a0fc9ebf814b0 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 6 Dec 2021 20:54:30 +0100 Subject: KVM: Keep memslots in tree-based structures instead of array-based ones The current memslot code uses a (reverse gfn-ordered) memslot array for keeping track of them. Because the memslot array that is currently in use cannot be modified every memslot management operation (create, delete, move, change flags) has to make a copy of the whole array so it has a scratch copy to work on. Strictly speaking, however, it is only necessary to make copy of the memslot that is being modified, copying all the memslots currently present is just a limitation of the array-based memslot implementation. Two memslot sets, however, are still needed so the VM continues to run on the currently active set while the requested operation is being performed on the second, currently inactive one. In order to have two memslot sets, but only one copy of actual memslots it is necessary to split out the memslot data from the memslot sets. 
The memslots themselves should be also kept independent of each other so they can be individually added or deleted. These two memslot sets should normally point to the same set of memslots. They can, however, be desynchronized when performing a memslot management operation by replacing the memslot to be modified by its copy. After the operation is complete, both memslot sets once again point to the same, common set of memslot data. This commit implements the aforementioned idea. For tracking of gfns an ordinary rbtree is used since memslots cannot overlap in the guest address space and so this data structure is sufficient for ensuring that lookups are done quickly. The "last used slot" mini-caches (both per-slot set one and per-vCPU one), that keep track of the last found-by-gfn memslot, are still present in the new code. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Maciej S. Szmigiero Message-Id: <17c0cf3663b760a0d3753d4ac08c0753e941b811.1638817641.git.maciej.szmigiero@oracle.com> --- include/linux/kvm_host.h | 143 +++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9552ad6d6652..9eda8a63feae 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -358,11 +359,13 @@ struct kvm_vcpu { struct kvm_dirty_ring dirty_ring; /* - * The index of the most recently used memslot by this vCPU. It's ok - * if this becomes stale due to memslot changes since we always check - * it is a valid slot. + * The most recently used memslot by this vCPU and the slots generation + * for which it is valid. + * No wraparound protection is needed since generations won't overflow in + * thousands of years, even assuming 1M memslot operations per second. 
*/ - int last_used_slot; + struct kvm_memory_slot *last_used_slot; + u64 last_used_slot_gen; }; /* must be called with irqs disabled */ @@ -427,9 +430,26 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) */ #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) +/* + * Since at idle each memslot belongs to two memslot sets it has to contain + * two embedded nodes for each data structure that it forms a part of. + * + * Two memslot sets (one active and one inactive) are necessary so the VM + * continues to run on one memslot set while the other is being modified. + * + * These two memslot sets normally point to the same set of memslots. + * They can, however, be desynchronized when performing a memslot management + * operation by replacing the memslot to be modified by its copy. + * After the operation is complete, both memslot sets once again point to + * the same, common set of memslot data. + * + * The memslots themselves are independent of each other so they can be + * individually added or deleted. + */ struct kvm_memory_slot { - struct hlist_node id_node; - struct interval_tree_node hva_node; + struct hlist_node id_node[2]; + struct interval_tree_node hva_node[2]; + struct rb_node gfn_node[2]; gfn_t base_gfn; unsigned long npages; unsigned long *dirty_bitmap; @@ -524,16 +544,13 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) } #endif -/* - * Note: - * memslots are not sorted by id anymore, please use id_to_memslot() - * to get the memslot by its id. - */ struct kvm_memslots { u64 generation; + atomic_long_t last_used_slot; struct rb_root_cached hva_tree; + struct rb_root gfn_tree; /* - * The mapping table from slot id to the index in memslots[]. + * The mapping table from slot id to memslot. * * 7-bit bucket count matches the size of the old id to index array for * 512 slots, while giving good performance with this slot count. 
@@ -541,9 +558,7 @@ struct kvm_memslots { * always result in higher memory usage (even for lower memslot counts). */ DECLARE_HASHTABLE(id_hash, 7); - atomic_t last_used_slot; - int used_slots; - struct kvm_memory_slot memslots[]; + int node_idx; }; struct kvm { @@ -565,6 +580,9 @@ struct kvm { struct mutex slots_arch_lock; struct mm_struct *mm; /* userspace tied to this vm */ unsigned long nr_memslot_pages; + /* The two memslot sets - active and inactive (per address space) */ + struct kvm_memslots __memslots[KVM_ADDRESS_SPACE_NUM][2]; + /* The current active memslot set for each address space */ struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct xarray vcpu_array; @@ -739,11 +757,10 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) return NULL; } -#define kvm_for_each_memslot(memslot, slots) \ - for (memslot = &slots->memslots[0]; \ - memslot < slots->memslots + slots->used_slots; memslot++) \ - if (WARN_ON_ONCE(!memslot->npages)) { \ - } else +static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) +{ + return vcpu->vcpu_idx; +} void kvm_destroy_vcpus(struct kvm *kvm); @@ -805,12 +822,23 @@ static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu) return __kvm_memslots(vcpu->kvm, as_id); } +static inline bool kvm_memslots_empty(struct kvm_memslots *slots) +{ + return RB_EMPTY_ROOT(&slots->gfn_tree); +} + +#define kvm_for_each_memslot(memslot, bkt, slots) \ + hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \ + if (WARN_ON_ONCE(!memslot->npages)) { \ + } else + static inline struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id) { struct kvm_memory_slot *slot; + int idx = slots->node_idx; - hash_for_each_possible(slots->id_hash, slot, id_node, id) { + hash_for_each_possible(slots->id_hash, slot, id_node[idx], id) { if (slot->id == id) return slot; } @@ -1214,25 +1242,15 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool 
kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* - * Returns a pointer to the memslot at slot_index if it contains gfn. + * Returns a pointer to the memslot if it contains gfn. * Otherwise returns NULL. */ static inline struct kvm_memory_slot * -try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn) +try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { - struct kvm_memory_slot *slot; - - if (slot_index < 0 || slot_index >= slots->used_slots) + if (!slot) return NULL; - /* - * slot_index can come from vcpu->last_used_slot which is not kept - * in sync with userspace-controllable memslot deletion. So use nospec - * to prevent the CPU from speculating past the end of memslots[]. - */ - slot_index = array_index_nospec(slot_index, slots->used_slots); - slot = &slots->memslots[slot_index]; - if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) return slot; else @@ -1240,65 +1258,46 @@ try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn) } /* - * Returns a pointer to the memslot that contains gfn and records the index of - * the slot in index. Otherwise returns NULL. + * Returns a pointer to the memslot that contains gfn. Otherwise returns NULL. * * With "approx" set returns the memslot also when the address falls * in a hole. In that case one of the memslots bordering the hole is * returned. - * - * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! 
*/ static inline struct kvm_memory_slot * -search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index, bool approx) +search_memslots(struct kvm_memslots *slots, gfn_t gfn, bool approx) { - int start = 0, end = slots->used_slots; - struct kvm_memory_slot *memslots = slots->memslots; struct kvm_memory_slot *slot; - - if (unlikely(!slots->used_slots)) - return NULL; - - while (start < end) { - int slot = start + (end - start) / 2; - - if (gfn >= memslots[slot].base_gfn) - end = slot; - else - start = slot + 1; - } - - if (approx && start >= slots->used_slots) { - *index = slots->used_slots - 1; - return &memslots[slots->used_slots - 1]; - } - - slot = try_get_memslot(slots, start, gfn); - if (slot) { - *index = start; - return slot; - } - if (approx) { - *index = start; - return &memslots[start]; + struct rb_node *node; + int idx = slots->node_idx; + + slot = NULL; + for (node = slots->gfn_tree.rb_node; node; ) { + slot = container_of(node, struct kvm_memory_slot, gfn_node[idx]); + if (gfn >= slot->base_gfn) { + if (gfn < slot->base_gfn + slot->npages) + return slot; + node = node->rb_right; + } else + node = node->rb_left; } - return NULL; + return approx ? slot : NULL; } static inline struct kvm_memory_slot * ____gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn, bool approx) { struct kvm_memory_slot *slot; - int slot_index = atomic_read(&slots->last_used_slot); - slot = try_get_memslot(slots, slot_index, gfn); + slot = (struct kvm_memory_slot *)atomic_long_read(&slots->last_used_slot); + slot = try_get_memslot(slot, gfn); if (slot) return slot; - slot = search_memslots(slots, gfn, &slot_index, approx); + slot = search_memslots(slots, gfn, approx); if (slot) { - atomic_set(&slots->last_used_slot, slot_index); + atomic_long_set(&slots->last_used_slot, (unsigned long)slot); return slot; } -- cgit v1.2.3 From f4209439b522432d140d33393d4a3f12e695527b Mon Sep 17 00:00:00 2001 From: "Maciej S. 
Szmigiero" Date: Mon, 6 Dec 2021 20:54:32 +0100 Subject: KVM: Optimize gfn lookup in kvm_zap_gfn_range() Introduce a memslots gfn upper bound operation and use it to optimize kvm_zap_gfn_range(). This way this handler can do a quick lookup for intersecting gfns and won't have to do a linear scan of the whole memslot set. Signed-off-by: Maciej S. Szmigiero Message-Id: --- include/linux/kvm_host.h | 94 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9eda8a63feae..3bc98497e796 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -846,6 +846,100 @@ struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id) return NULL; } +/* Iterator used for walking memslots that overlap a gfn range. */ +struct kvm_memslot_iter { + struct kvm_memslots *slots; + struct rb_node *node; + struct kvm_memory_slot *slot; +}; + +static inline void kvm_memslot_iter_next(struct kvm_memslot_iter *iter) +{ + iter->node = rb_next(iter->node); + if (!iter->node) + return; + + iter->slot = container_of(iter->node, struct kvm_memory_slot, gfn_node[iter->slots->node_idx]); +} + +static inline void kvm_memslot_iter_start(struct kvm_memslot_iter *iter, + struct kvm_memslots *slots, + gfn_t start) +{ + int idx = slots->node_idx; + struct rb_node *tmp; + struct kvm_memory_slot *slot; + + iter->slots = slots; + + /* + * Find the so called "upper bound" of a key - the first node that has + * its key strictly greater than the searched one (the start gfn in our case). 
+ */ + iter->node = NULL; + for (tmp = slots->gfn_tree.rb_node; tmp; ) { + slot = container_of(tmp, struct kvm_memory_slot, gfn_node[idx]); + if (start < slot->base_gfn) { + iter->node = tmp; + tmp = tmp->rb_left; + } else { + tmp = tmp->rb_right; + } + } + + /* + * Find the slot with the lowest gfn that can possibly intersect with + * the range, so we'll ideally have slot start <= range start + */ + if (iter->node) { + /* + * A NULL previous node means that the very first slot + * already has a higher start gfn. + * In this case slot start > range start. + */ + tmp = rb_prev(iter->node); + if (tmp) + iter->node = tmp; + } else { + /* a NULL node below means no slots */ + iter->node = rb_last(&slots->gfn_tree); + } + + if (iter->node) { + iter->slot = container_of(iter->node, struct kvm_memory_slot, gfn_node[idx]); + + /* + * It is possible in the slot start < range start case that the + * found slot ends before or at range start (slot end <= range start) + * and so it does not overlap the requested range. + * + * In such non-overlapping case the next slot (if it exists) will + * already have slot start > range start, otherwise the logic above + * would have found it instead of the current slot. 
+ */ + if (iter->slot->base_gfn + iter->slot->npages <= start) + kvm_memslot_iter_next(iter); + } +} + +static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_t end) +{ + if (!iter->node) + return false; + + /* + * If this slot starts beyond or at the end of the range so does + * every next one + */ + return iter->slot->base_gfn < end; +} + +/* Iterate over each memslot at least partially intersecting [start, end) range */ +#define kvm_for_each_memslot_in_gfn_range(iter, slots, start, end) \ + for (kvm_memslot_iter_start(iter, slots, start); \ + kvm_memslot_iter_is_valid(iter, end); \ + kvm_memslot_iter_next(iter)) + /* * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: * - create a new memory slot -- cgit v1.2.3 From 8283e36abfff507c64fe8289ac30ea7ab59648aa Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 15 Nov 2021 15:45:58 -0800 Subject: KVM: x86/mmu: Propagate memslot const qualifier In preparation for implementing in-place hugepage promotion, various functions will need to be called from zap_collapsible_spte_range, which has the const qualifier on its memslot argument. Propagate the const qualifier to the various functions which will be needed. This just serves to simplify the following patch. No functional change intended. 
Signed-off-by: Ben Gardon Message-Id: <20211115234603.2908381-11-bgardon@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3bc98497e796..3eb7695aaa73 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -460,7 +460,7 @@ struct kvm_memory_slot { u16 as_id; }; -static inline bool kvm_slot_dirty_track_enabled(struct kvm_memory_slot *slot) +static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_LOG_DIRTY_PAGES; } @@ -994,9 +994,9 @@ void kvm_set_page_accessed(struct page *page); kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); -kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); -kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); -kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, +kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); +kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn); +kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, bool *writable, hva_t *hva); @@ -1073,7 +1073,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); -void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn); +void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); 
-- cgit v1.2.3 From 510958e997217e39a16b47afb5a44dfa39013964 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:11:57 -0700 Subject: KVM: Force PPC to define its own rcuwait object Do not define/reference kvm_vcpu.wait if __KVM_HAVE_ARCH_WQP is true, and instead force the architecture (PPC) to define its own rcuwait object. Allowing common KVM to directly access vcpu->wait without a guard makes it all too easy to introduce potential bugs, e.g. kvm_vcpu_block(), kvm_vcpu_on_spin(), and async_pf_execute() all operate on vcpu->wait, not the result of kvm_arch_vcpu_get_wait(), and so may do the wrong thing for PPC. Due to PPC's shenanigans with respect to callbacks and waits (it switches to the virtual core's wait object at KVM_RUN!?!?), it's not clear whether or not this fixes any bugs. Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3eb7695aaa73..afacbfb2e482 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -314,7 +314,9 @@ struct kvm_vcpu { struct mutex mutex; struct kvm_run *run; +#ifndef __KVM_HAVE_ARCH_WQP struct rcuwait wait; +#endif struct pid __rcu *pid; int sigset_active; sigset_t sigset; -- cgit v1.2.3 From 91b99ea7065786d0bff1c9281b002455dbaeb08b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:06 -0700 Subject: KVM: Rename kvm_vcpu_block() => kvm_vcpu_halt() Rename kvm_vcpu_block() to kvm_vcpu_halt() in preparation for splitting the actual "block" sequences into a separate helper (to be named kvm_vcpu_block()). x86 will use the standalone block-only path to handle non-halt cases where the vCPU is not runnable. Rename block_ns to halt_ns to match the new function name. No functional change intended. 
Reviewed-by: David Matlack Reviewed-by: Christian Borntraeger Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index afacbfb2e482..ea3c22d55d56 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1102,7 +1102,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); -void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_vcpu_halt(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From fac4268894394213127e43856f41d10f29131e69 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:07 -0700 Subject: KVM: Split out a kvm_vcpu_block() helper from kvm_vcpu_halt() Factor out the "block" part of kvm_vcpu_halt() so that x86 can emulate non-halt wait/sleep/block conditions that should not be subjected to halt-polling. No functional change intended. 
Reviewed-by: Christian Borntraeger Reviewed-by: David Matlack Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ea3c22d55d56..bd13c5b5bd1d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1103,6 +1103,7 @@ void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); void kvm_vcpu_halt(struct kvm_vcpu *vcpu); +bool kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From c3858335c711569b82a234a560dc19247e8f3fcc Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 8 Oct 2021 19:12:08 -0700 Subject: KVM: stats: Add stat to detect if vcpu is currently blocking Add a "blocking" stat that userspace can use to detect the case where a vCPU is not being run because of a vCPU/guest action, e.g. HLT or WFS on x86, WFI on arm64, etc... Current guest/host/halt stats don't show this well, e.g. if a guest halts for a long period of time then the vCPU could appear pathologically blocked due to a host condition, when in reality the vCPU has been put into a not-runnable state by the guest. 
Originally-by: Cannon Matthews Suggested-by: Sean Christopherson Reviewed-by: David Matlack Signed-off-by: Jing Zhang [sean: renamed stat to "blocking", massaged changelog] Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-16-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 ++- include/linux/kvm_types.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bd13c5b5bd1d..dc7740cafea7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1587,7 +1587,8 @@ struct _kvm_stats_desc { STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist, \ HALT_POLL_HIST_COUNT), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ - HALT_POLL_HIST_COUNT) + HALT_POLL_HIST_COUNT), \ + STATS_DESC_ICOUNTER(VCPU_GENERIC, blocking) extern struct dentry *kvm_debugfs_dir; diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 234eab059839..888ef12862c9 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -87,6 +87,7 @@ struct kvm_vcpu_stat_generic { u64 halt_poll_success_hist[HALT_POLL_HIST_COUNT]; u64 halt_poll_fail_hist[HALT_POLL_HIST_COUNT]; u64 halt_wait_hist[HALT_POLL_HIST_COUNT]; + u64 blocking; }; #define KVM_STATS_NAME_SIZE 48 -- cgit v1.2.3 From d92a5d1c6c757f659ffb9c2c2e65fcf3d571c14e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Oct 2021 19:12:12 -0700 Subject: KVM: Add helpers to wake/query blocking vCPU Add helpers to wake and query a blocking vCPU. In addition to providing nice names, the helpers reduce the probability of KVM neglecting to use kvm_arch_vcpu_get_wait(). No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20211009021236.4122790-20-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index dc7740cafea7..f8ed799e8674 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1286,6 +1286,20 @@ static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu) #endif } +/* + * Wake a vCPU if necessary, but don't do any stats/metadata updates. Returns + * true if the vCPU was blocking and was awakened, false otherwise. + */ +static inline bool __kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) +{ + return !!rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu)); +} + +static inline bool kvm_vcpu_is_blocking(struct kvm_vcpu *vcpu) +{ + return rcuwait_active(kvm_arch_vcpu_get_wait(vcpu)); +} + #ifdef __KVM_HAVE_ARCH_INTC_INITIALIZED /* * returns true if the virtual interrupt controller is initialized and -- cgit v1.2.3 From 815f0e738a8d5663a02350e2580706829144a722 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Wed, 3 Nov 2021 09:50:59 +0100 Subject: clk: gate: Add devm_clk_hw_register_gate() Add devm_clk_hw_register_gate() - devres-managed version of clk_hw_register_gate() Suggested-by: Stephen Boyd Signed-off-by: Horatiu Vultur Acked-by: Nicolas Ferre Signed-off-by: Nicolas Ferre Link: https://lore.kernel.org/r/20211103085102.1656081-2-horatiu.vultur@microchip.com --- include/linux/clk-provider.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index f59c875271a0..2faa6f7aa8a8 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -490,6 +490,13 @@ struct clk_hw *__clk_hw_register_gate(struct device *dev, unsigned long flags, void __iomem *reg, u8 bit_idx, u8 clk_gate_flags, spinlock_t *lock); +struct clk_hw 
*__devm_clk_hw_register_gate(struct device *dev, + struct device_node *np, const char *name, + const char *parent_name, const struct clk_hw *parent_hw, + const struct clk_parent_data *parent_data, + unsigned long flags, + void __iomem *reg, u8 bit_idx, + u8 clk_gate_flags, spinlock_t *lock); struct clk *clk_register_gate(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 bit_idx, @@ -544,6 +551,22 @@ struct clk *clk_register_gate(struct device *dev, const char *name, __clk_hw_register_gate((dev), NULL, (name), NULL, NULL, (parent_data), \ (flags), (reg), (bit_idx), \ (clk_gate_flags), (lock)) +/** + * devm_clk_hw_register_gate - register a gate clock with the clock framework + * @dev: device that is registering this clock + * @name: name of this clock + * @parent_name: name of this clock's parent + * @flags: framework-specific flags for this clock + * @reg: register address to control gating of this clock + * @bit_idx: which bit in the register controls gating of this clock + * @clk_gate_flags: gate-specific flags for this clock + * @lock: shared register lock for this clock + */ +#define devm_clk_hw_register_gate(dev, name, parent_name, flags, reg, bit_idx,\ + clk_gate_flags, lock) \ + __devm_clk_hw_register_gate((dev), NULL, (name), (parent_name), NULL, \ + NULL, (flags), (reg), (bit_idx), \ + (clk_gate_flags), (lock)) void clk_unregister_gate(struct clk *clk); void clk_hw_unregister_gate(struct clk_hw *hw); int clk_gate_is_enabled(struct clk_hw *hw); -- cgit v1.2.3 From 444dd878e85fb33fcfb2682cfdab4c236f33ea3e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Dec 2021 17:19:47 +0100 Subject: PM: runtime: Fix pm_runtime_active() kerneldoc comment The kerneldoc comment of pm_runtime_active() does not reflect the behavior of the function, so update it accordingly. Fixes: 403d2d116ec0 ("PM: runtime: Add kerneldoc comments to multiple helpers") Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson --- include/linux/pm_runtime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 222da43b7096..eddd66d426ca 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -129,7 +129,7 @@ static inline bool pm_runtime_suspended(struct device *dev) * pm_runtime_active - Check whether or not a device is runtime-active. * @dev: Target device. * - * Return %true if runtime PM is enabled for @dev and its runtime PM status is + * Return %true if runtime PM is disabled for @dev or its runtime PM status is * %RPM_ACTIVE, or %false otherwise. * * Note that the return value of this function can only be trusted if it is -- cgit v1.2.3 From 74d9555580c48a04b2c3b742dfb0c80777aa0b26 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 8 Nov 2021 16:09:41 +0000 Subject: PM: hibernate: Allow ACPI hardware signature to be honoured Theoretically, when the hardware signature in FACS changes, the OS is supposed to gracefully decline to attempt to resume from S4: "If the signature has changed, OSPM will not restore the system context and can boot from scratch" In practice, Windows doesn't do this and many laptop vendors do allow the signature to change especially when docking/undocking, so it would be a bad idea to simply comply with the specification by default in the general case. However, there are use cases where we do want the compliant behaviour and we know it's safe. Specifically, when resuming virtual machines where we know the hypervisor has changed sufficiently that resume will fail. We really want to be able to *tell* the guest kernel not to try, so it boots cleanly and doesn't just crash. This patch provides a way to opt in to the spec-compliant behaviour on the command line. 
A follow-up patch may do this automatically for certain "known good" machines based on a DMI match, or perhaps just for all hypervisor guests since there's no good reason a hypervisor would change the hardware_signature that it exposes to guests *unless* it wants them to obey the ACPI specification. Signed-off-by: David Woodhouse Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 2 +- include/linux/suspend.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index b28f8790192a..6c0798db6bde 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -506,7 +506,7 @@ acpi_status acpi_release_memory(acpi_handle handle, struct resource *res, int acpi_resources_are_enforced(void); #ifdef CONFIG_HIBERNATION -void __init acpi_no_s4_hw_signature(void); +void __init acpi_check_s4_hw_signature(int check); #endif #ifdef CONFIG_PM_SLEEP diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 8af13ba60c7e..5785d909c321 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -446,6 +446,7 @@ extern unsigned long get_safe_page(gfp_t gfp_mask); extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); +extern u32 swsusp_hardware_signature; extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); extern int hibernate(void); extern bool system_entering_hibernation(void); -- cgit v1.2.3 From 8260b9820f7050461b8969305bbd8cb5654f0c74 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Mon, 6 Dec 2021 16:55:03 +0300 Subject: x86/sev: Use CC_ATTR attribute to generalize string I/O unroll INS/OUTS are not supported in TDX guests and cause #UD. Kernel has to avoid them when running in TDX guest. To support existing usage, string I/O operations are unrolled using IN/OUT instructions. 
AMD SEV platform implements this support by adding unroll logic in ins#bwl()/outs#bwl() macros with SEV-specific checks. Since TDX VM guests will also need similar support, use CC_ATTR_GUEST_UNROLL_STRING_IO and generic cc_platform_has() API to implement it. String I/O helpers were the last users of sev_key_active() interface and sev_enable_key static key. Remove them. [ bp: Move comment too and do not delete it. ] Suggested-by: Tom Lendacky Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Tom Lendacky Tested-by: Tom Lendacky Link: https://lkml.kernel.org/r/20211206135505.75045-2-kirill.shutemov@linux.intel.com --- include/linux/cc_platform.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index a075b70b9a70..efd8205282da 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -61,6 +61,17 @@ enum cc_attr { * Examples include SEV-ES. */ CC_ATTR_GUEST_STATE_ENCRYPT, + + /** + * @CC_ATTR_GUEST_UNROLL_STRING_IO: String I/O is implemented with + * IN/OUT instructions + * + * The platform/OS is running as a guest/virtual machine and uses + * IN/OUT instructions in place of string I/O. + * + * Examples include TDX guest & SEV. + */ + CC_ATTR_GUEST_UNROLL_STRING_IO, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM -- cgit v1.2.3 From 3f9bb0301d50ce27421eff4b710c2bbe58111a83 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 6 Dec 2021 18:57:47 +0200 Subject: net: dsa: make dp->bridge_num one-based MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I have seen too many bugs already due to the fact that we must encode an invalid dp->bridge_num as a negative value, because the natural tendency is to check that invalid value using (!dp->bridge_num). 
Latest example can be seen in commit 1bec0f05062c ("net: dsa: fix bridge_num not getting cleared after ports leaving the bridge"). Convert the existing users to assume that dp->bridge_num == 0 is the encoding for invalid, and valid bridge numbers start from 1. Signed-off-by: Vladimir Oltean Reviewed-by: Alvin Šipraga Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 254b165f2b44..0af4371fbebb 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -38,13 +38,13 @@ void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id); int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, struct net_device *br, - int bridge_num); + unsigned int bridge_num); void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, struct net_device *br, - int bridge_num); + unsigned int bridge_num); -u16 dsa_8021q_bridge_tx_fwd_offload_vid(int bridge_num); +u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp); -- cgit v1.2.3 From d3eed0e57d5d1bcbf1bd60f83a4adfe7d7b8dd9c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 6 Dec 2021 18:57:56 +0200 Subject: net: dsa: keep the bridge_dev and bridge_num as part of the same structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main desire behind this is to provide coherent bridge information to the fast path without locking. For example, right now we set dp->bridge_dev and dp->bridge_num from separate code paths, it is theoretically possible for a packet transmission to read these two port properties consecutively and find a bridge number which does not correspond with the bridge device. Another desire is to start passing more complex bridge information to dsa_switch_ops functions. 
For example, with FDB isolation, it is expected that drivers will need to be passed the bridge which requested an FDB/MDB entry to be offloaded, and along with that bridge_dev, the associated bridge_num should be passed too, in case the driver might want to implement an isolation scheme based on that number. We already pass the {bridge_dev, bridge_num} pair to the TX forwarding offload switch API, however we'd like to remove that and squash it into the basic bridge join/leave API. So that means we need to pass this pair to the bridge join/leave API. During dsa_port_bridge_leave, first we unset dp->bridge_dev, then we call the driver's .port_bridge_leave with what used to be our dp->bridge_dev, but provided as an argument. When bridge_dev and bridge_num get folded into a single structure, we need to preserve this behavior in dsa_port_bridge_leave: we need a copy of what used to be in dp->bridge. Switch drivers check bridge membership by comparing dp->bridge_dev with the provided bridge_dev, but now, if we provide the struct dsa_bridge as a pointer, they cannot keep comparing dp->bridge to the provided pointer, since this only points to an on-stack copy. To make this obvious and prevent driver writers from forgetting and doing stupid things, in this new API, the struct dsa_bridge is provided as a full structure (not very large, contains an int and a pointer) instead of a pointer. An explicit comparison function needs to be used to determine bridge membership: dsa_port_offloads_bridge(). 
Signed-off-by: Vladimir Oltean Reviewed-by: Alvin Šipraga Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 0af4371fbebb..939a1beaddf7 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -7,6 +7,7 @@ #include #include +#include struct dsa_switch; struct dsa_port; @@ -37,12 +38,10 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id); int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, - struct net_device *br, - unsigned int bridge_num); + struct dsa_bridge bridge); void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, - struct net_device *br, - unsigned int bridge_num); + struct dsa_bridge bridge); u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); -- cgit v1.2.3 From 283e6f5a8166f075eba78da6b867d76cc5d47e77 Mon Sep 17 00:00:00 2001 From: Sergey Ryazanov Date: Tue, 7 Dec 2021 12:21:40 +0300 Subject: net: wwan: make debugfs optional Debugfs interface is optional for the regular modem use. Some distros and users will want to disable this feature for security or kernel size reasons. So add a configuration option that allows to completely disable the debugfs interface of the WWAN devices. A primary considered use case for this option was embedded firmwares. For example, in OpenWrt, you can not completely disable debugfs, as a lot of wireless stuff can only be configured and monitored with the debugfs knobs. At the same time, reducing the size of a kernel and modules is an essential task in the world of embedded software. Disabling the WWAN and IOSM debugfs interfaces allows us to save 50K (x86-64 build) of space for module storage. Not much, but already considerable when you only have 16MB of storage. 
So it is hard to just disable the whole of debugfs. Users need some fine grained set of options to control which debugfs interface is important and should be available and which is not. The new configuration symbol is enabled by default and is hidden under the EXPERT option. So a regular user would not be bothered by yet another configuration question, while an embedded distro maintainer will be able to reduce the final image size a little more. Signed-off-by: Sergey Ryazanov Reviewed-by: Leon Romanovsky Reviewed-by: Loic Poulain Acked-by: M Chetan Kumar Signed-off-by: Jakub Kicinski --- include/linux/wwan.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 1646aa3e6779..e143c88bf4b0 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -171,6 +171,13 @@ int wwan_register_ops(struct device *parent, const struct wwan_ops *ops, void wwan_unregister_ops(struct device *parent); +#ifdef CONFIG_WWAN_DEBUGFS struct dentry *wwan_get_debugfs_dir(struct device *parent); +#else +static inline struct dentry *wwan_get_debugfs_dir(struct device *parent) +{ + return ERR_PTR(-ENODEV); +} +#endif #endif /* __WWAN_H */ -- cgit v1.2.3 From 1197528aaea79ed4909aba695d18fdecc5387a36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:28 +0100 Subject: genirq/msi: Guard sysfs code No point in building unused code when CONFIG_SYSFS=n. 
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20211206210223.985907940@linutronix.de --- include/linux/msi.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index e616f94c7c58..d43b9469c88b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -239,9 +239,19 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); +#ifdef CONFIG_SYSFS const struct attribute_group **msi_populate_sysfs(struct device *dev); void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups); +#else +static inline const struct attribute_group **msi_populate_sysfs(struct device *dev) +{ + return NULL; +} +static inline void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups) +{ +} +#endif /* * The arch hooks to setup up msi irqs. Default functions are implemented -- cgit v1.2.3 From 1dd2c6a0817fd08f80dee75d7d3bd99a0c4b828d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:29 +0100 Subject: genirq/msi: Remove unused domain callbacks No users and there is no need to grow them. 
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211126223824.322987915@linutronix.de Link: https://lore.kernel.org/r/20211206210224.041777889@linutronix.de --- include/linux/msi.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index d43b9469c88b..4b962f73f84a 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -304,7 +304,6 @@ struct msi_domain_info; * @msi_free: Domain specific function to free a MSI interrupts * @msi_check: Callback for verification of the domain/info/dev data * @msi_prepare: Prepare the allocation of the interrupts in the domain - * @msi_finish: Optional callback to finalize the allocation * @set_desc: Set the msi descriptor for an interrupt * @handle_error: Optional error handler if the allocation fails * @domain_alloc_irqs: Optional function to override the default allocation @@ -312,12 +311,11 @@ struct msi_domain_info; * @domain_free_irqs: Optional function to override the default free * function. * - * @get_hwirq, @msi_init and @msi_free are callbacks used by - * msi_create_irq_domain() and related interfaces + * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying + * irqdomain. * - * @msi_check, @msi_prepare, @msi_finish, @set_desc and @handle_error - * are callbacks used by msi_domain_alloc_irqs() and related - * interfaces which are based on msi_desc. + * @msi_check, @msi_prepare, @handle_error and @set_desc are callbacks used by + * msi_domain_alloc/free_irqs(). * * @domain_alloc_irqs, @domain_free_irqs can be used to override the * default allocation/free functions (__msi_domain_alloc/free_irqs). 
This @@ -351,7 +349,6 @@ struct msi_domain_ops { int (*msi_prepare)(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); - void (*msi_finish)(msi_alloc_info_t *arg, int retval); void (*set_desc)(msi_alloc_info_t *arg, struct msi_desc *desc); int (*handle_error)(struct irq_domain *domain, -- cgit v1.2.3 From 3ba1f050c91d5ce3672dbf3a55dc2451c0b342e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:31 +0100 Subject: genirq/msi: Fixup includes Remove the kobject.h include from msi.h as it's not required and add a sysfs.h include to the core code instead. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20211206210224.103502021@linutronix.de --- include/linux/msi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 4b962f73f84a..5c627750f269 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -2,7 +2,7 @@ #ifndef LINUX_MSI_H #define LINUX_MSI_H -#include +#include #include #include -- cgit v1.2.3 From 9e8688c5f2990dadcf83728cd00a7e8497fc6aa9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:33 +0100 Subject: PCI/MSI: Make pci_msi_domain_write_msg() static There is no point to have this function public as it is set by the PCI core anyway when a PCI/MSI irqdomain is created. 
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Acked-by: Bjorn Helgaas # PCI Link: https://lore.kernel.org/r/20211206210224.157070464@linutronix.de --- include/linux/msi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 5c627750f269..d7b143a79cb4 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -455,7 +455,6 @@ void *platform_msi_get_host_data(struct irq_domain *domain); #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN -void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg); struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); -- cgit v1.2.3 From ade044a3d0f0389e4f916337c505550acc3fd011 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:34 +0100 Subject: PCI/MSI: Remove msi_desc_to_pci_sysdata() Last user is gone long ago. 
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211206210224.210768199@linutronix.de --- include/linux/msi.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index d7b143a79cb4..ac6fec105edc 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -218,13 +218,8 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, for_each_msi_entry((desc), &(pdev)->dev) struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc); -void *msi_desc_to_pci_sysdata(struct msi_desc *desc); void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg); #else /* CONFIG_PCI_MSI */ -static inline void *msi_desc_to_pci_sysdata(struct msi_desc *desc) -{ - return NULL; -} static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) { } -- cgit v1.2.3 From e58f2259b91c02974c20db7b28d39d810a21249b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:39 +0100 Subject: genirq/msi, treewide: Use a named struct for PCI/MSI attributes The unnamed struct sucks and is in the way of further cleanups. Stick the PCI related MSI data into a real data structure and cleanup all users. No functional change. 
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Reviewed-by: Greg Kroah-Hartman Acked-by: Kalle Valo Link: https://lore.kernel.org/r/20211206210224.374863119@linutronix.de --- include/linux/msi.h | 84 ++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index ac6fec105edc..7e5c13f4e41b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -68,6 +68,42 @@ static inline void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) typedef void (*irq_write_msi_msg_t)(struct msi_desc *desc, struct msi_msg *msg); +/** + * pci_msi_desc - PCI/MSI specific MSI descriptor data + * + * @msi_mask: [PCI MSI] MSI cached mask bits + * @msix_ctrl: [PCI MSI-X] MSI-X cached per vector control bits + * @is_msix: [PCI MSI/X] True if MSI-X + * @multiple: [PCI MSI/X] log2 num of messages allocated + * @multi_cap: [PCI MSI/X] log2 num of messages supported + * @can_mask: [PCI MSI/X] Masking supported? + * @is_64: [PCI MSI/X] Address size: 0=32bit 1=64bit + * @entry_nr: [PCI MSI/X] Entry which is described by this descriptor + * @default_irq:[PCI MSI/X] The default pre-assigned non-MSI irq + * @mask_pos: [PCI MSI] Mask register position + * @mask_base: [PCI MSI-X] Mask register base address + */ +struct pci_msi_desc { + union { + u32 msi_mask; + u32 msix_ctrl; + }; + struct { + u8 is_msix : 1; + u8 multiple : 3; + u8 multi_cap : 3; + u8 can_mask : 1; + u8 is_64 : 1; + u8 is_virtual : 1; + u16 entry_nr; + unsigned default_irq; + } msi_attrib; + union { + u8 mask_pos; + void __iomem *mask_base; + }; +}; + /** * platform_msi_desc - Platform device specific msi descriptor data * @msi_priv_data: Pointer to platform private data @@ -107,17 +143,7 @@ struct ti_sci_inta_msi_desc { * address or data changes * @write_msi_msg_data: Data parameter for the callback. 
* - * @msi_mask: [PCI MSI] MSI cached mask bits - * @msix_ctrl: [PCI MSI-X] MSI-X cached per vector control bits - * @is_msix: [PCI MSI/X] True if MSI-X - * @multiple: [PCI MSI/X] log2 num of messages allocated - * @multi_cap: [PCI MSI/X] log2 num of messages supported - * @maskbit: [PCI MSI/X] Mask-Pending bit supported? - * @is_64: [PCI MSI/X] Address size: 0=32bit 1=64bit - * @entry_nr: [PCI MSI/X] Entry which is described by this descriptor - * @default_irq:[PCI MSI/X] The default pre-assigned non-MSI irq - * @mask_pos: [PCI MSI] Mask register position - * @mask_base: [PCI MSI-X] Mask register base address + * @pci: [PCI] PCI speficic msi descriptor data * @platform: [platform] Platform device specific msi descriptor data * @fsl_mc: [fsl-mc] FSL MC device specific msi descriptor data * @inta: [INTA] TISCI based INTA specific msi descriptor data @@ -138,38 +164,10 @@ struct msi_desc { void *write_msi_msg_data; union { - /* PCI MSI/X specific data */ - struct { - union { - u32 msi_mask; - u32 msix_ctrl; - }; - struct { - u8 is_msix : 1; - u8 multiple : 3; - u8 multi_cap : 3; - u8 can_mask : 1; - u8 is_64 : 1; - u8 is_virtual : 1; - u16 entry_nr; - unsigned default_irq; - } msi_attrib; - union { - u8 mask_pos; - void __iomem *mask_base; - }; - }; - - /* - * Non PCI variants add their data structure here. New - * entries need to use a named structure. We want - * proper name spaces for this. The PCI part is - * anonymous for now as it would require an immediate - * tree wide cleanup. - */ - struct platform_msi_desc platform; - struct fsl_mc_msi_desc fsl_mc; - struct ti_sci_inta_msi_desc inta; + struct pci_msi_desc pci; + struct platform_msi_desc platform; + struct fsl_mc_msi_desc fsl_mc; + struct ti_sci_inta_msi_desc inta; }; }; -- cgit v1.2.3 From ae72f3156729541581f526b85883ca53a20df2fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:42 +0100 Subject: PCI/MSI: Make arch_restore_msi_irqs() less horrible. 
Make arch_restore_msi_irqs() return a boolean which indicates whether the core code should restore the MSI message or not. Get rid of the indirection in x86. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Acked-by: Bjorn Helgaas # PCI Link: https://lore.kernel.org/r/20211206210224.485668098@linutronix.de --- include/linux/msi.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 7e5c13f4e41b..673899fc24f6 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -273,11 +273,10 @@ static inline void arch_teardown_msi_irqs(struct pci_dev *dev) #endif /* - * The restore hooks are still available as they are useful even - * for fully irq domain based setups. Courtesy to XEN/X86. + * The restore hook is still available even for fully irq domain based + * setups. Courtesy to XEN/X86. */ -void arch_restore_msi_irqs(struct pci_dev *dev); -void default_restore_msi_irqs(struct pci_dev *dev); +bool arch_restore_msi_irqs(struct pci_dev *dev); #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN -- cgit v1.2.3 From aa423ac4221abdfb8588751e7838ca5f42f56db3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:52 +0100 Subject: PCI/MSI: Split out irqdomain code Move the irqdomain specific code into its own file. 
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Reviewed-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211206210224.817754783@linutronix.de --- include/linux/msi.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 673899fc24f6..7ff7cf23b78d 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -259,17 +259,6 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); -#else -static inline int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - WARN_ON_ONCE(1); - return -ENODEV; -} - -static inline void arch_teardown_msi_irqs(struct pci_dev *dev) -{ - WARN_ON_ONCE(1); -} #endif /* -- cgit v1.2.3 From 85aa607e79f8343f1ea028b29bdf8b6bc99c729a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:54 +0100 Subject: PCI/MSI: Sanitize MSI-X table map handling Unmapping the MSI-X base mapping in the loops which allocate/free MSI descriptors is daft and in the way of allowing runtime expansion of MSI-X descriptors. Store the mapping in struct pci_dev and free it after freeing the MSI-X descriptors. 
Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211206210224.871651518@linutronix.de --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 18a75c8e615c..8cb103677f5a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -473,6 +473,7 @@ struct pci_dev { u8 ptm_granularity; #endif #ifdef CONFIG_PCI_MSI + void __iomem *msix_base; const struct attribute_group **msi_irq_groups; #endif struct pci_vpd vpd; -- cgit v1.2.3 From cd119b09a87d8beb50356d8c5c6aa42d89c44eb7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:56 +0100 Subject: PCI/MSI: Move msi_lock to struct pci_dev It's only required for PCI/MSI. So no point in having it in every struct device. Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211206210224.925241961@linutronix.de --- include/linux/device.h | 2 -- include/linux/pci.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index e270cb740b9e..2a22875238a6 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -407,7 +407,6 @@ struct dev_links_info { * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pin-control.rst for details. - * @msi_lock: Lock to protect MSI mask cache and mask register * @msi_list: Hosts MSI descriptors * @msi_domain: The generic MSI domain this device is using. * @numa_node: NUMA node this device is close to. 
@@ -508,7 +507,6 @@ struct device { struct dev_pin_info *pins; #endif #ifdef CONFIG_GENERIC_MSI_IRQ - raw_spinlock_t msi_lock; struct list_head msi_list; #endif #ifdef CONFIG_DMA_OPS diff --git a/include/linux/pci.h b/include/linux/pci.h index 8cb103677f5a..5cc46baef519 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -474,6 +474,7 @@ struct pci_dev { #endif #ifdef CONFIG_PCI_MSI void __iomem *msix_base; + raw_spinlock_t msi_lock; const struct attribute_group **msi_irq_groups; #endif struct pci_vpd vpd; -- cgit v1.2.3 From 57ce3a3c99b21e9c4f951ef01e0a3603c987c259 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:57 +0100 Subject: PCI/MSI: Make pci_msi_domain_check_cap() static No users outside of that file. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Reviewed-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211206210224.980989243@linutronix.de --- include/linux/msi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 7ff7cf23b78d..5248678e05d1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -439,8 +439,6 @@ void *platform_msi_get_host_data(struct irq_domain *domain); struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); -int pci_msi_domain_check_cap(struct irq_domain *domain, - struct msi_domain_info *info, struct device *dev); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); bool pci_dev_has_special_msi_domain(struct pci_dev *pdev); -- cgit v1.2.3 From 890337624e1fa2da079fc1c036a62d178c985280 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:59 +0100 Subject: genirq/msi: Handle PCI/MSI allocation fail in core code Get rid of yet another irqdomain callback and let 
the core code return the already available information of how many descriptors could be allocated. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Reviewed-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas # PCI Link: https://lore.kernel.org/r/20211206210225.046615302@linutronix.de --- include/linux/msi.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 5248678e05d1..ba4a39c430b5 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -286,7 +286,6 @@ struct msi_domain_info; * @msi_check: Callback for verification of the domain/info/dev data * @msi_prepare: Prepare the allocation of the interrupts in the domain * @set_desc: Set the msi descriptor for an interrupt - * @handle_error: Optional error handler if the allocation fails * @domain_alloc_irqs: Optional function to override the default allocation * function. * @domain_free_irqs: Optional function to override the default free @@ -295,7 +294,7 @@ struct msi_domain_info; * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying * irqdomain. * - * @msi_check, @msi_prepare, @handle_error and @set_desc are callbacks used by + * @msi_check, @msi_prepare and @set_desc are callbacks used by * msi_domain_alloc/free_irqs(). 
* * @domain_alloc_irqs, @domain_free_irqs can be used to override the @@ -332,8 +331,6 @@ struct msi_domain_ops { msi_alloc_info_t *arg); void (*set_desc)(msi_alloc_info_t *arg, struct msi_desc *desc); - int (*handle_error)(struct irq_domain *domain, - struct msi_desc *desc, int error); int (*domain_alloc_irqs)(struct irq_domain *domain, struct device *dev, int nvec); void (*domain_free_irqs)(struct irq_domain *domain, -- cgit v1.2.3 From 50468e4313355b161cac8a5155a45832995b7f25 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 16 Nov 2021 18:21:16 +0200 Subject: x86/sgx: Add an attribute for the amount of SGX memory in a NUMA node == Problem == The amount of SGX memory on a system is determined by the BIOS and it varies wildly between systems. It can be as small as dozens of MB's and as large as many GB's on servers. Just like how applications need to know how much regular RAM is available, enclave builders need to know how much SGX memory an enclave can consume. == Solution == Introduce a new sysfs file: /sys/devices/system/node/nodeX/x86/sgx_total_bytes to enumerate the amount of SGX memory available in each NUMA node. This serves the same function for SGX as /proc/meminfo or /sys/devices/system/node/nodeX/meminfo does for normal RAM. 'sgx_total_bytes' is needed today to help drive the SGX selftests. SGX-specific swap code is exercised by creating overcommitted enclaves which are larger than the physical SGX memory on the system. They currently use a CPUID-based approach which can diverge from the actual amount of SGX memory available. 'sgx_total_bytes' ensures that the selftests can work efficiently and do not attempt stupid things like creating a 100,000 MB enclave on a system with 128 MB of SGX memory. 
== Implementation Details == Introduce CONFIG_HAVE_ARCH_NODE_DEV_GROUP opt-in flag to expose an arch specific attribute group, and add an attribute for the amount of SGX memory in bytes to each NUMA node: == ABI Design Discussion == As opposed to the per-node ABI, a single, global ABI was considered. However, this would prevent enclaves from being able to size themselves so that they fit on a single NUMA node. Essentially, a single value would rule out NUMA optimizations for enclaves. Create a new "x86/" directory inside each "nodeX/" sysfs directory. 'sgx_total_bytes' is expected to be the first of at least a few sgx-specific files to be placed in the new directory. Just scanning /proc/meminfo, these are the no-brainers that we have for RAM, but we need for SGX: MemTotal: xxxx kB // sgx_total_bytes (implemented here) MemFree: yyyy kB // sgx_free_bytes SwapTotal: zzzz kB // sgx_swapped_bytes So, at *least* three. I think we will eventually end up needing something more along the lines of a dozen. A new directory (as opposed to being in the nodeX/ "root") directory avoids cluttering the root with several "sgx_*" files. Place the new file in a new "nodeX/x86/" directory because SGX is highly x86-specific. It is very unlikely that any other architecture (or even non-Intel x86 vendor) will ever implement SGX. Using "sgx/" as opposed to "x86/" was also considered. But, there is a real chance this can get used for other arch-specific purposes. 
[ dhansen: rewrite changelog ] Signed-off-by: Jarkko Sakkinen Signed-off-by: Dave Hansen Acked-by: Greg Kroah-Hartman Acked-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211116162116.93081-2-jarkko@kernel.org --- include/linux/numa.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/numa.h b/include/linux/numa.h index cb44cfe2b725..59df211d051f 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -58,4 +58,8 @@ static inline int phys_to_target_node(u64 start) } #endif +#ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP +extern const struct attribute_group arch_node_dev_group; +#endif + #endif /* _LINUX_NUMA_H */ -- cgit v1.2.3 From 67b967ddd93d0ed57d392a00f6f90060f0910c0e Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 18 Nov 2021 12:46:59 +0100 Subject: mtd: Introduce an expert mode for forensics and debugging purposes When developing NAND controller drivers or when debugging filesystem corruptions, it is quite common to need hacking locally into the MTD/NAND core in order to get access to the content of the bad blocks. Instead of having multiple implementations out there let's provide a simple yet effective specific MTD-wide debugfs entry to fully disable these checks on purpose. A warning is added to inform the user when this mode gets enabled.
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20211118114659.1282855-1-miquel.raynal@bootlin.com --- include/linux/mtd/mtd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index f5e7dfc2e4e9..1ffa933121f6 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -711,4 +711,7 @@ static inline int mtd_is_bitflip_or_eccerr(int err) { unsigned mtd_mmap_capabilities(struct mtd_info *mtd); +extern char *mtd_expert_analysis_warning; +extern bool mtd_expert_analysis_mode; + #endif /* __MTD_MTD_H__ */ -- cgit v1.2.3 From cab2d3fd6866e089b5c50db09dece131f85bfebd Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Thu, 9 Dec 2021 18:46:33 +0530 Subject: bus: mhi: core: Add support for forced PM resume For whatever reason, some devices like QCA6390, WCN6855 using ath11k are not in M3 state during PM resume, but still functional. The mhi_pm_resume should then not fail in those cases, and let the higher level device specific stack continue resuming process. Add an API mhi_pm_resume_force(), to force resuming irrespective of the current MHI state. This fixes a regression with non functional ath11k WiFi after suspend/resume cycle on some machines. 
Bug report: https://bugzilla.kernel.org/show_bug.cgi?id=214179 Link: https://lore.kernel.org/regressions/871r5p0x2u.fsf@codeaurora.org/ Fixes: 020d3b26c07a ("bus: mhi: Early MHI resume failure in non M3 state") Cc: stable@vger.kernel.org #5.13 Reported-by: Kalle Valo Reported-by: Pengyu Ma Tested-by: Kalle Valo Acked-by: Kalle Valo Signed-off-by: Loic Poulain [mani: Switched to API, added bug report, reported-by tags and CCed stable] Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20211209131633.4168-1-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 723985879035..a5cc4cdf9cc8 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -663,6 +663,19 @@ int mhi_pm_suspend(struct mhi_controller *mhi_cntrl); */ int mhi_pm_resume(struct mhi_controller *mhi_cntrl); +/** + * mhi_pm_resume_force - Force resume MHI from suspended state + * @mhi_cntrl: MHI controller + * + * Resume the device irrespective of its MHI state. As per the MHI spec, devices + * have to be in M3 state during resume. But some devices seem to be in a + * different MHI state other than M3 but they continue working fine if allowed. + * This API is intended to be used for such devices. + * + * Return: 0 if the resume succeeds, a negative error code otherwise + */ +int mhi_pm_resume_force(struct mhi_controller *mhi_cntrl); + /** * mhi_download_rddm_image - Download ramdump image from device for * debugging purpose. -- cgit v1.2.3 From dc70ec217cec504e6f8fee8fd91bf5c118af05f2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 21 Nov 2021 12:54:40 +0000 Subject: KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING I'd like to make the build include dirty_ring.c based on whether the arch wants it or not.
That's a whole lot simpler if there's a config symbol instead of doing it implicitly on KVM_DIRTY_LOG_PAGE_OFFSET being set to something non-zero. Signed-off-by: David Woodhouse Message-Id: <20211121125451.9489-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_dirty_ring.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 120e5e90fa1d..4da8d4a4140b 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -27,9 +27,9 @@ struct kvm_dirty_ring { int index; }; -#if (KVM_DIRTY_LOG_PAGE_OFFSET == 0) +#ifndef CONFIG_HAVE_KVM_DIRTY_RING /* - * If KVM_DIRTY_LOG_PAGE_OFFSET not defined, kvm_dirty_ring.o should + * If CONFIG_HAVE_KVM_DIRTY_RING not defined, kvm_dirty_ring.o should * not be included as well, so define these nop functions for the arch. */ static inline u32 kvm_dirty_ring_get_rsvd_entries(void) @@ -74,7 +74,7 @@ static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) return true; } -#else /* KVM_DIRTY_LOG_PAGE_OFFSET == 0 */ +#else /* CONFIG_HAVE_KVM_DIRTY_RING */ u32 kvm_dirty_ring_get_rsvd_entries(void); int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); @@ -98,6 +98,6 @@ struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset); void kvm_dirty_ring_free(struct kvm_dirty_ring *ring); bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring); -#endif /* KVM_DIRTY_LOG_PAGE_OFFSET == 0 */ +#endif /* CONFIG_HAVE_KVM_DIRTY_RING */ #endif /* KVM_DIRTY_RING_H */ -- cgit v1.2.3 From 42288cb44c4b5fff7653bc392b583a2b8bd6a8c0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 8 Dec 2021 17:04:51 -0800 Subject: wait: add wake_up_pollfree() Several ->poll() implementations are special in that they use a waitqueue whose lifetime is the current task, rather than the struct file as is normally the case.
This is okay for blocking polls, since a blocking poll occurs within one task; however, non-blocking polls require another solution. This solution is for the queue to be cleared before it is freed, using 'wake_up_poll(wq, EPOLLHUP | POLLFREE);'. However, that has a bug: wake_up_poll() calls __wake_up() with nr_exclusive=1. Therefore, if there are multiple "exclusive" waiters, and the wakeup function for the first one returns a positive value, only that one will be called. That's *not* what's needed for POLLFREE; POLLFREE is special in that it really needs to wake up everyone. Considering the three non-blocking poll systems: - io_uring poll doesn't handle POLLFREE at all, so it is broken anyway. - aio poll is unaffected, since it doesn't support exclusive waits. However, that's fragile, as someone could add this feature later. - epoll doesn't appear to be broken by this, since its wakeup function returns 0 when it sees POLLFREE. But this is fragile. Although there is a workaround (see epoll), it's better to define a function which always sends POLLFREE to all waiters. Add such a function. Also make it verify that the queue really becomes empty after all waiters have been woken up. 
Reported-by: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211209010455.42744-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/wait.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2d0df57c9902..851e07da2583 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); +void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) @@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); #define wake_up_interruptible_sync_poll_locked(x, m) \ __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) +/** + * wake_up_pollfree - signal that a polled waitqueue is going away + * @wq_head: the wait queue head + * + * In the very rare cases where a ->poll() implementation uses a waitqueue whose + * lifetime is tied to a task rather than to the 'struct file' being polled, + * this function must be called before the waitqueue is freed so that + * non-blocking polls (e.g. epoll) are notified that the queue is going away. + * + * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via + * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. + */ +static inline void wake_up_pollfree(struct wait_queue_head *wq_head) +{ + /* + * For performance reasons, we don't always take the queue lock here. 
+ * Therefore, we might race with someone removing the last entry from + * the queue, and proceed while they still hold the queue lock. + * However, rcu_read_lock() is required to be held in such cases, so we + * can safely proceed with an RCU-delayed free. + */ + if (waitqueue_active(wq_head)) + __wake_up_pollfree(wq_head); +} + #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ -- cgit v1.2.3 From 6abfaaf124a81b7d2ab132cc2c9885baa14171e5 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 27 Oct 2021 16:18:45 +0200 Subject: fs_parse: allow parameter value to be empty Allow parameter value to be empty by specifying fs_param_can_be_empty flag. Signed-off-by: Lukas Czerner Cc: Al Viro Reviewed-by: Carlos Maiolino Link: https://lore.kernel.org/r/20211027141857.33657-2-lczerner@redhat.com Signed-off-by: Theodore Ts'o --- include/linux/fs_parser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index aab0ffc6bac6..f103c91139d4 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -42,7 +42,7 @@ struct fs_parameter_spec { u8 opt; /* Option number (returned by fs_parse()) */ unsigned short flags; #define fs_param_neg_with_no 0x0002 /* "noxxx" is negative param */ -#define fs_param_neg_with_empty 0x0004 /* "xxx=" is negative param */ +#define fs_param_can_be_empty 0x0004 /* "xxx=" is allowed */ #define fs_param_deprecated 0x0008 /* The param is deprecated */ const void *data; }; -- cgit v1.2.3 From 3e5b1feccea7db576353ffc302f78d522e4116e6 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 9 Dec 2021 13:11:32 +0000 Subject: net: phylink: add legacy_pre_march2020 indicator Add a boolean to phylink_config to indicate whether a driver has not been updated for the changes in commit 7cceb599d15d ("net: phylink: avoid mac_config calls"), and thus are reliant on the old behaviour. 
We were currently keying the phylink behaviour on the presence of a PCS, but this is sub-optimal for modern drivers that may not have a PCS. This commit merely introduces the new flag, but does not add any use, since we need all legacy drivers to set this flag before it can be used. Once these legacy drivers have been updated, we can remove this flag. Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 01224235df0f..d005b8e36048 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -84,6 +84,8 @@ enum phylink_op_type { * struct phylink_config - PHYLINK configuration structure * @dev: a pointer to a struct device associated with the MAC * @type: operation type of PHYLINK instance + * @legacy_pre_march2020: driver has not been updated for March 2020 updates + * (See commit 7cceb599d15d ("net: phylink: avoid mac_config calls") * @pcs_poll: MAC PCS cannot provide link change interrupt * @poll_fixed_state: if true, starts link_poll, * if MAC link is at %MLO_AN_FIXED mode. @@ -97,6 +99,7 @@ enum phylink_op_type { struct phylink_config { struct device *dev; enum phylink_op_type type; + bool legacy_pre_march2020; bool pcs_poll; bool poll_fixed_state; bool ovr_an_inband; -- cgit v1.2.3 From 001f4261fe4d5ae710cf1f445b6cae6d9d3ae26e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 9 Dec 2021 13:11:48 +0000 Subject: net: phylink: use legacy_pre_march2020 Use the legacy flag to indicate whether we should operate in legacy mode. This allows us to stop using the presence of a PCS as an indicator to the age of the phylink user, and make PCS presence optional. 
Legacy mode involves: 1) calling mac_config() whenever the link comes up 2) calling mac_config() whenever the inband advertisement changes, possibly followed by a call to mac_an_restart() 3) making use of mac_an_restart() 4) making use of mac_pcs_get_state() All the above functionality was moved to a separate "PCS" block of operations in March 2020. Update the documents to indicate the differences that this flag makes. Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index d005b8e36048..a2f266cc3442 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -190,6 +190,10 @@ void validate(struct phylink_config *config, unsigned long *supported, * negotiation completion state in @state->an_complete, and link up state * in @state->link. If possible, @state->lp_advertising should also be * populated. + * + * Note: This is a legacy method. This function will not be called unless + * legacy_pre_march2020 is set in &struct phylink_config and there is no + * PCS attached. */ void mac_pcs_get_state(struct phylink_config *config, struct phylink_link_state *state); @@ -230,6 +234,15 @@ int mac_prepare(struct phylink_config *config, unsigned int mode, * guaranteed to be correct, and so any mac_config() implementation must * never reference these fields. * + * Note: For legacy March 2020 drivers (drivers with legacy_pre_march2020 set + * in their &phylink_config and which don't have a PCS), this function will be + * called on each link up event, and to also change the in-band advert. For + * non-legacy drivers, it will only be called to reconfigure the MAC for a + * "major" change in e.g. interface mode. It will not be called for changes + * in speed, duplex or pause modes or to change the in-band advertisement.
+ * In any case, it is strongly preferred that speed, duplex and pause settings + * are handled in the mac_link_up() method and not in this method. + * * (this requires a rewrite - please refer to mac_link_up() for situations * where the PCS and MAC are not tightly integrated.) * @@ -314,6 +327,10 @@ int mac_finish(struct phylink_config *config, unsigned int mode, /** * mac_an_restart() - restart 802.3z BaseX autonegotiation * @config: a pointer to a &struct phylink_config. + * + * Note: This is a legacy method. This function will not be called unless + * legacy_pre_march2020 is set in &struct phylink_config and there is no + * PCS attached. */ void mac_an_restart(struct phylink_config *config); -- cgit v1.2.3 From 1a2fb220edca98d18f90e3ef5bd6853a6b22b1b8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 6 Dec 2021 22:27:58 -0800 Subject: skbuff: Extract list pointers to silence compiler warnings Under both -Warray-bounds and the object_size sanitizer, the compiler is upset about accessing prev/next of sk_buff when the object it thinks it is coming from is sk_buff_head. The warning is a false positive due to the compiler taking a conservative approach, opting to warn at casting time rather than access time. However, in support of enabling -Warray-bounds globally (which has found many real bugs), arrange things for sk_buff so that the compiler can unambiguously see that there is no intention to access anything except prev/next. 
Introduce and cast to a separate struct sk_buff_list, which contains _only_ the first two fields, silencing the warnings: In file included from ./include/net/net_namespace.h:39, from ./include/linux/netdevice.h:37, from net/core/netpoll.c:17: net/core/netpoll.c: In function 'refill_skbs': ./include/linux/skbuff.h:2086:9: warning: array subscript 'struct sk_buff[0]' is partly outside array bounds of 'struct sk_buff_head[1]' [-Warray-bounds] 2086 | __skb_insert(newsk, next->prev, next, list); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/netpoll.c:49:28: note: while referencing 'skb_pool' 49 | static struct sk_buff_head skb_pool; | ^~~~~~~~ This change results in no executable instruction differences. Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20211207062758.2324338-1-keescook@chromium.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dd262bd8ddbe..6535294f6a48 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -292,9 +292,11 @@ struct tc_skb_ext { #endif struct sk_buff_head { - /* These two members must be first. */ - struct sk_buff *next; - struct sk_buff *prev; + /* These two members must be first to match sk_buff. */ + struct_group_tagged(sk_buff_list, list, + struct sk_buff *next; + struct sk_buff *prev; + ); __u32 qlen; spinlock_t lock; @@ -730,7 +732,7 @@ typedef unsigned char *sk_buff_data_t; struct sk_buff { union { struct { - /* These two members must be first. */ + /* These two members must be first to match sk_buff_head. 
*/ struct sk_buff *next; struct sk_buff *prev; @@ -1976,8 +1978,8 @@ static inline void __skb_insert(struct sk_buff *newsk, */ WRITE_ONCE(newsk->next, next); WRITE_ONCE(newsk->prev, prev); - WRITE_ONCE(next->prev, newsk); - WRITE_ONCE(prev->next, newsk); + WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk); + WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk); WRITE_ONCE(list->qlen, list->qlen + 1); } @@ -2073,7 +2075,7 @@ static inline void __skb_queue_after(struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *newsk) { - __skb_insert(newsk, prev, prev->next, list); + __skb_insert(newsk, prev, ((struct sk_buff_list *)prev)->next, list); } void skb_append(struct sk_buff *old, struct sk_buff *newsk, @@ -2083,7 +2085,7 @@ static inline void __skb_queue_before(struct sk_buff_head *list, struct sk_buff *next, struct sk_buff *newsk) { - __skb_insert(newsk, next->prev, next, list); + __skb_insert(newsk, ((struct sk_buff_list *)next)->prev, next, list); } /** -- cgit v1.2.3 From a4f1192cb53758a7210ed5a9ee695aeba22f75fb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 9 Dec 2021 14:30:33 +0200 Subject: percpu_ref: Replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used.
Signed-off-by: Andy Shevchenko Signed-off-by: Dennis Zhou --- include/linux/percpu-refcount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index b31d3f3312ce..d73a1c08c3e3 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -51,9 +51,9 @@ #define _LINUX_PERCPU_REFCOUNT_H #include -#include #include #include +#include #include struct percpu_ref; -- cgit v1.2.3 From 9756f64c8f2d19c0029a5827bda8ac275302ec22 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:11 +0100 Subject: kcsan: Avoid checking scoped accesses from nested contexts Avoid checking scoped accesses from nested contexts (such as nested interrupts or in scheduler code) which share the same kcsan_ctx. This is to avoid detecting false positive races of accesses in the same thread with currently scoped accesses: consider setting up a watchpoint for a non-scoped (normal) access that also "conflicts" with a current scoped access. In a nested interrupt (or in the scheduler), which shares the same kcsan_ctx, we cannot check scoped accesses set up in the parent context -- simply ignore them in this case. With the introduction of kcsan_ctx::disable_scoped, we can also clean up kcsan_check_scoped_accesses()'s recursion guard, and do not need to modify the list's prev pointer. Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- include/linux/kcsan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h index fc266ecb2a4d..13cef3458fed 100644 --- a/include/linux/kcsan.h +++ b/include/linux/kcsan.h @@ -21,6 +21,7 @@ */ struct kcsan_ctx { int disable_count; /* disable counter */ + int disable_scoped; /* disable scoped access counter */ int atomic_next; /* number of following atomic ops */ /* -- cgit v1.2.3 From 69562e4983d93e2791c0bf128b07462afbd7f4dc Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 5 Aug 2021 14:57:45 +0200 Subject: kcsan: Add core support for a subset of weak memory modeling Add support for modeling a subset of weak memory, which will enable detection of a subset of data races due to missing memory barriers. KCSAN's approach to detecting missing memory barriers is based on modeling access reordering, and enabled if `CONFIG_KCSAN_WEAK_MEMORY=y`, which depends on `CONFIG_KCSAN_STRICT=y`. The feature can be enabled or disabled at boot and runtime via the `kcsan.weak_memory` boot parameter. Each memory access for which a watchpoint is set up, is also selected for simulated reordering within the scope of its function (at most 1 in-flight access). We are limited to modeling the effects of "buffering" (delaying the access), since the runtime cannot "prefetch" accesses (therefore no acquire modeling). Once an access has been selected for reordering, it is checked along every other access until the end of the function scope. If an appropriate memory barrier is encountered, the access will no longer be considered for reordering. When the result of a memory operation should be ordered by a barrier, KCSAN can then detect data races where the conflict only occurs as a result of a missing barrier due to reordering accesses. Suggested-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- include/linux/kcsan-checks.h | 10 +++++++++- include/linux/kcsan.h | 10 +++++++++- include/linux/sched.h | 3 +++ 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index 5f5965246877..a1c6a89fde71 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -99,7 +99,15 @@ void kcsan_set_access_mask(unsigned long mask); /* Scoped access information. */ struct kcsan_scoped_access { - struct list_head list; + union { + struct list_head list; /* scoped_accesses list */ + /* + * Not an entry in scoped_accesses list; stack depth from where + * the access was initialized. + */ + int stack_depth; + }; + /* Access information. */ const volatile void *ptr; size_t size; diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h index 13cef3458fed..c07c71f5ba4f 100644 --- a/include/linux/kcsan.h +++ b/include/linux/kcsan.h @@ -49,8 +49,16 @@ struct kcsan_ctx { */ unsigned long access_mask; - /* List of scoped accesses. */ + /* List of scoped accesses; likely to be empty. */ struct list_head scoped_accesses; + +#ifdef CONFIG_KCSAN_WEAK_MEMORY + /* + * Scoped access for modeling access reordering to detect missing memory + * barriers; only keep 1 to keep fast-path complexity manageable. 
+ */ + struct kcsan_scoped_access reorder_access; +#endif }; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index 78c351e35fec..0cd40b010487 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1339,6 +1339,9 @@ struct task_struct { #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif +#ifdef CONFIG_KCSAN_WEAK_MEMORY + int kcsan_stack_depth; +#endif #endif #if IS_ENABLED(CONFIG_KUNIT) -- cgit v1.2.3 From 0b8b0830ac1419d7250fde31ea78793a03f3db44 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:13 +0100 Subject: kcsan: Add core memory barrier instrumentation functions Add the core memory barrier instrumentation functions. These invalidate the current in-flight reordered access based on the rules for the respective barrier types and in-flight access type. To obtain barrier instrumentation that can be disabled via __no_kcsan with appropriate compiler-support (and not just with objtool help), barrier instrumentation repurposes __atomic_signal_fence(), instead of inserting explicit calls. Crucially, __atomic_signal_fence() normally does not map to any real instructions, but is still intercepted by fsanitize=thread. As a result, like any other instrumentation done by the compiler, barrier instrumentation can be disabled with __no_kcsan. Unfortunately Clang and GCC currently differ in their __no_kcsan aka __no_sanitize_thread behaviour with respect to builtin atomics (and __tsan_func_{entry,exit}) instrumentation. This is already reflected in Kconfig.kcsan's dependencies for KCSAN_WEAK_MEMORY. A later change will introduce support for newer versions of Clang that can implement __no_kcsan to also remove the additional instrumentation introduced by KCSAN_WEAK_MEMORY. Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- include/linux/kcsan-checks.h | 71 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index a1c6a89fde71..9d2c869167f2 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -36,6 +36,36 @@ */ void __kcsan_check_access(const volatile void *ptr, size_t size, int type); +/* + * See definition of __tsan_atomic_signal_fence() in kernel/kcsan/core.c. + * Note: The mappings are arbitrary, and do not reflect any real mappings of C11 + * memory orders to the LKMM memory orders and vice-versa! + */ +#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb __ATOMIC_SEQ_CST +#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb __ATOMIC_ACQ_REL +#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb __ATOMIC_ACQUIRE +#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_release __ATOMIC_RELEASE + +/** + * __kcsan_mb - full memory barrier instrumentation + */ +void __kcsan_mb(void); + +/** + * __kcsan_wmb - write memory barrier instrumentation + */ +void __kcsan_wmb(void); + +/** + * __kcsan_rmb - read memory barrier instrumentation + */ +void __kcsan_rmb(void); + +/** + * __kcsan_release - release barrier instrumentation + */ +void __kcsan_release(void); + /** * kcsan_disable_current - disable KCSAN for the current context * @@ -159,6 +189,10 @@ void kcsan_end_scoped_access(struct kcsan_scoped_access *sa); static inline void __kcsan_check_access(const volatile void *ptr, size_t size, int type) { } +static inline void __kcsan_mb(void) { } +static inline void __kcsan_wmb(void) { } +static inline void __kcsan_rmb(void) { } +static inline void __kcsan_release(void) { } static inline void kcsan_disable_current(void) { } static inline void kcsan_enable_current(void) { } static inline void kcsan_enable_current_nowarn(void) { } @@ -191,12 +225,45 @@ static inline void kcsan_end_scoped_access(struct kcsan_scoped_access *sa) { } */ #define 
__kcsan_disable_current kcsan_disable_current #define __kcsan_enable_current kcsan_enable_current_nowarn -#else +#else /* __SANITIZE_THREAD__ */ static inline void kcsan_check_access(const volatile void *ptr, size_t size, int type) { } static inline void __kcsan_enable_current(void) { } static inline void __kcsan_disable_current(void) { } -#endif +#endif /* __SANITIZE_THREAD__ */ + +#if defined(CONFIG_KCSAN_WEAK_MEMORY) && defined(__SANITIZE_THREAD__) +/* + * Normal barrier instrumentation is not done via explicit calls, but by mapping + * to a repurposed __atomic_signal_fence(), which normally does not generate any + * real instructions, but is still intercepted by fsanitize=thread. This means, + * like any other compile-time instrumentation, barrier instrumentation can be + * disabled with the __no_kcsan function attribute. + * + * Also see definition of __tsan_atomic_signal_fence() in kernel/kcsan/core.c. + */ +#define __KCSAN_BARRIER_TO_SIGNAL_FENCE(name) \ + static __always_inline void kcsan_##name(void) \ + { \ + barrier(); \ + __atomic_signal_fence(__KCSAN_BARRIER_TO_SIGNAL_FENCE_##name); \ + barrier(); \ + } +__KCSAN_BARRIER_TO_SIGNAL_FENCE(mb) +__KCSAN_BARRIER_TO_SIGNAL_FENCE(wmb) +__KCSAN_BARRIER_TO_SIGNAL_FENCE(rmb) +__KCSAN_BARRIER_TO_SIGNAL_FENCE(release) +#elif defined(CONFIG_KCSAN_WEAK_MEMORY) && defined(__KCSAN_INSTRUMENT_BARRIERS__) +#define kcsan_mb __kcsan_mb +#define kcsan_wmb __kcsan_wmb +#define kcsan_rmb __kcsan_rmb +#define kcsan_release __kcsan_release +#else /* CONFIG_KCSAN_WEAK_MEMORY && ... */ +static inline void kcsan_mb(void) { } +static inline void kcsan_wmb(void) { } +static inline void kcsan_rmb(void) { } +static inline void kcsan_release(void) { } +#endif /* CONFIG_KCSAN_WEAK_MEMORY && ... 
*/ /** * __kcsan_check_read - check regular read access for races -- cgit v1.2.3 From f948666de517cf8ebef7cb2c9b2d669dec4bfe2e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:22 +0100 Subject: locking/barriers, kcsan: Add instrumentation for barriers Adds the required KCSAN instrumentation for barriers if CONFIG_SMP. KCSAN supports modeling the effects of: smp_mb() smp_rmb() smp_wmb() smp_store_release() Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/spinlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index b4e5ca23f840..5c0c5174155d 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -171,7 +171,7 @@ do { \ * Architectures that can implement ACQUIRE better need to take care. */ #ifndef smp_mb__after_spinlock -#define smp_mb__after_spinlock() do { } while (0) +#define smp_mb__after_spinlock() kcsan_mb() #endif #ifdef CONFIG_DEBUG_SPINLOCK -- cgit v1.2.3 From e87c4f6642f49627c3430cb3ee78c73fb51b48e4 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:24 +0100 Subject: locking/atomics, kcsan: Add instrumentation for barriers Adds the required KCSAN instrumentation for barriers of atomics. Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- include/linux/atomic/atomic-instrumented.h | 135 ++++++++++++++++++++++++++++- 1 file changed, 134 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index a0f654370da3..5d69b143c28e 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -45,6 +45,7 @@ atomic_set(atomic_t *v, int i) static __always_inline void atomic_set_release(atomic_t *v, int i) { + kcsan_release(); instrument_atomic_write(v, sizeof(*v)); arch_atomic_set_release(v, i); } @@ -59,6 +60,7 @@ atomic_add(int i, atomic_t *v) static __always_inline int atomic_add_return(int i, atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_return(i, v); } @@ -73,6 +75,7 @@ atomic_add_return_acquire(int i, atomic_t *v) static __always_inline int atomic_add_return_release(int i, atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_return_release(i, v); } @@ -87,6 +90,7 @@ atomic_add_return_relaxed(int i, atomic_t *v) static __always_inline int atomic_fetch_add(int i, atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add(i, v); } @@ -101,6 +105,7 @@ atomic_fetch_add_acquire(int i, atomic_t *v) static __always_inline int atomic_fetch_add_release(int i, atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add_release(i, v); } @@ -122,6 +127,7 @@ atomic_sub(int i, atomic_t *v) static __always_inline int atomic_sub_return(int i, atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_sub_return(i, v); } @@ -136,6 +142,7 @@ atomic_sub_return_acquire(int i, atomic_t *v) static __always_inline int atomic_sub_return_release(int i, atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return 
arch_atomic_sub_return_release(i, v); } @@ -150,6 +157,7 @@ atomic_sub_return_relaxed(int i, atomic_t *v) static __always_inline int atomic_fetch_sub(int i, atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_sub(i, v); } @@ -164,6 +172,7 @@ atomic_fetch_sub_acquire(int i, atomic_t *v) static __always_inline int atomic_fetch_sub_release(int i, atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_sub_release(i, v); } @@ -185,6 +194,7 @@ atomic_inc(atomic_t *v) static __always_inline int atomic_inc_return(atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_return(v); } @@ -199,6 +209,7 @@ atomic_inc_return_acquire(atomic_t *v) static __always_inline int atomic_inc_return_release(atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_return_release(v); } @@ -213,6 +224,7 @@ atomic_inc_return_relaxed(atomic_t *v) static __always_inline int atomic_fetch_inc(atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_inc(v); } @@ -227,6 +239,7 @@ atomic_fetch_inc_acquire(atomic_t *v) static __always_inline int atomic_fetch_inc_release(atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_inc_release(v); } @@ -248,6 +261,7 @@ atomic_dec(atomic_t *v) static __always_inline int atomic_dec_return(atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_return(v); } @@ -262,6 +276,7 @@ atomic_dec_return_acquire(atomic_t *v) static __always_inline int atomic_dec_return_release(atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_return_release(v); } @@ -276,6 +291,7 @@ atomic_dec_return_relaxed(atomic_t *v) static __always_inline int atomic_fetch_dec(atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, 
sizeof(*v)); return arch_atomic_fetch_dec(v); } @@ -290,6 +306,7 @@ atomic_fetch_dec_acquire(atomic_t *v) static __always_inline int atomic_fetch_dec_release(atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_dec_release(v); } @@ -311,6 +328,7 @@ atomic_and(int i, atomic_t *v) static __always_inline int atomic_fetch_and(int i, atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_and(i, v); } @@ -325,6 +343,7 @@ atomic_fetch_and_acquire(int i, atomic_t *v) static __always_inline int atomic_fetch_and_release(int i, atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_and_release(i, v); } @@ -346,6 +365,7 @@ atomic_andnot(int i, atomic_t *v) static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_andnot(i, v); } @@ -360,6 +380,7 @@ atomic_fetch_andnot_acquire(int i, atomic_t *v) static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_andnot_release(i, v); } @@ -381,6 +402,7 @@ atomic_or(int i, atomic_t *v) static __always_inline int atomic_fetch_or(int i, atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_or(i, v); } @@ -395,6 +417,7 @@ atomic_fetch_or_acquire(int i, atomic_t *v) static __always_inline int atomic_fetch_or_release(int i, atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_or_release(i, v); } @@ -416,6 +439,7 @@ atomic_xor(int i, atomic_t *v) static __always_inline int atomic_fetch_xor(int i, atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_xor(i, v); } @@ -430,6 +454,7 @@ atomic_fetch_xor_acquire(int i, atomic_t *v) static __always_inline int 
atomic_fetch_xor_release(int i, atomic_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_xor_release(i, v); } @@ -444,6 +469,7 @@ atomic_fetch_xor_relaxed(int i, atomic_t *v) static __always_inline int atomic_xchg(atomic_t *v, int i) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_xchg(v, i); } @@ -458,6 +484,7 @@ atomic_xchg_acquire(atomic_t *v, int i) static __always_inline int atomic_xchg_release(atomic_t *v, int i) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_xchg_release(v, i); } @@ -472,6 +499,7 @@ atomic_xchg_relaxed(atomic_t *v, int i) static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_cmpxchg(v, old, new); } @@ -486,6 +514,7 @@ atomic_cmpxchg_acquire(atomic_t *v, int old, int new) static __always_inline int atomic_cmpxchg_release(atomic_t *v, int old, int new) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_cmpxchg_release(v, old, new); } @@ -500,6 +529,7 @@ atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_try_cmpxchg(v, old, new); @@ -516,6 +546,7 @@ atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_try_cmpxchg_release(v, old, new); @@ -532,6 +563,7 @@ atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return 
arch_atomic_sub_and_test(i, v); } @@ -539,6 +571,7 @@ atomic_sub_and_test(int i, atomic_t *v) static __always_inline bool atomic_dec_and_test(atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_and_test(v); } @@ -546,6 +579,7 @@ atomic_dec_and_test(atomic_t *v) static __always_inline bool atomic_inc_and_test(atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_and_test(v); } @@ -553,6 +587,7 @@ atomic_inc_and_test(atomic_t *v) static __always_inline bool atomic_add_negative(int i, atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_negative(i, v); } @@ -560,6 +595,7 @@ atomic_add_negative(int i, atomic_t *v) static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add_unless(v, a, u); } @@ -567,6 +603,7 @@ atomic_fetch_add_unless(atomic_t *v, int a, int u) static __always_inline bool atomic_add_unless(atomic_t *v, int a, int u) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_unless(v, a, u); } @@ -574,6 +611,7 @@ atomic_add_unless(atomic_t *v, int a, int u) static __always_inline bool atomic_inc_not_zero(atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_not_zero(v); } @@ -581,6 +619,7 @@ atomic_inc_not_zero(atomic_t *v) static __always_inline bool atomic_inc_unless_negative(atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_unless_negative(v); } @@ -588,6 +627,7 @@ atomic_inc_unless_negative(atomic_t *v) static __always_inline bool atomic_dec_unless_positive(atomic_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_unless_positive(v); } @@ -595,6 +635,7 @@ atomic_dec_unless_positive(atomic_t *v) static __always_inline int atomic_dec_if_positive(atomic_t *v) { + 
kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_if_positive(v); } @@ -623,6 +664,7 @@ atomic64_set(atomic64_t *v, s64 i) static __always_inline void atomic64_set_release(atomic64_t *v, s64 i) { + kcsan_release(); instrument_atomic_write(v, sizeof(*v)); arch_atomic64_set_release(v, i); } @@ -637,6 +679,7 @@ atomic64_add(s64 i, atomic64_t *v) static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_return(i, v); } @@ -651,6 +694,7 @@ atomic64_add_return_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_add_return_release(s64 i, atomic64_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_return_release(i, v); } @@ -665,6 +709,7 @@ atomic64_add_return_relaxed(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add(i, v); } @@ -679,6 +724,7 @@ atomic64_fetch_add_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_add_release(s64 i, atomic64_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add_release(i, v); } @@ -700,6 +746,7 @@ atomic64_sub(s64 i, atomic64_t *v) static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_return(i, v); } @@ -714,6 +761,7 @@ atomic64_sub_return_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_return_release(i, v); } @@ -728,6 +776,7 @@ atomic64_sub_return_relaxed(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return 
arch_atomic64_fetch_sub(i, v); } @@ -742,6 +791,7 @@ atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_sub_release(s64 i, atomic64_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_sub_release(i, v); } @@ -763,6 +813,7 @@ atomic64_inc(atomic64_t *v) static __always_inline s64 atomic64_inc_return(atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_return(v); } @@ -777,6 +828,7 @@ atomic64_inc_return_acquire(atomic64_t *v) static __always_inline s64 atomic64_inc_return_release(atomic64_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_return_release(v); } @@ -791,6 +843,7 @@ atomic64_inc_return_relaxed(atomic64_t *v) static __always_inline s64 atomic64_fetch_inc(atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_inc(v); } @@ -805,6 +858,7 @@ atomic64_fetch_inc_acquire(atomic64_t *v) static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_inc_release(v); } @@ -826,6 +880,7 @@ atomic64_dec(atomic64_t *v) static __always_inline s64 atomic64_dec_return(atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_return(v); } @@ -840,6 +895,7 @@ atomic64_dec_return_acquire(atomic64_t *v) static __always_inline s64 atomic64_dec_return_release(atomic64_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_return_release(v); } @@ -854,6 +910,7 @@ atomic64_dec_return_relaxed(atomic64_t *v) static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_dec(v); } @@ -868,6 +925,7 @@ atomic64_fetch_dec_acquire(atomic64_t *v) static __always_inline s64 
atomic64_fetch_dec_release(atomic64_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_dec_release(v); } @@ -889,6 +947,7 @@ atomic64_and(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_and(i, v); } @@ -903,6 +962,7 @@ atomic64_fetch_and_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_and_release(s64 i, atomic64_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_and_release(i, v); } @@ -924,6 +984,7 @@ atomic64_andnot(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_andnot(i, v); } @@ -938,6 +999,7 @@ atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_andnot_release(i, v); } @@ -959,6 +1021,7 @@ atomic64_or(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_or(i, v); } @@ -973,6 +1036,7 @@ atomic64_fetch_or_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_or_release(s64 i, atomic64_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_or_release(i, v); } @@ -994,6 +1058,7 @@ atomic64_xor(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_xor(i, v); } @@ -1008,6 +1073,7 @@ atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_xor_release(s64 i, atomic64_t *v) { + 
kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_xor_release(i, v); } @@ -1022,6 +1088,7 @@ atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 i) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_xchg(v, i); } @@ -1036,6 +1103,7 @@ atomic64_xchg_acquire(atomic64_t *v, s64 i) static __always_inline s64 atomic64_xchg_release(atomic64_t *v, s64 i) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_xchg_release(v, i); } @@ -1050,6 +1118,7 @@ atomic64_xchg_relaxed(atomic64_t *v, s64 i) static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_cmpxchg(v, old, new); } @@ -1064,6 +1133,7 @@ atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) static __always_inline s64 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_cmpxchg_release(v, old, new); } @@ -1078,6 +1148,7 @@ atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic64_try_cmpxchg(v, old, new); @@ -1094,6 +1165,7 @@ atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic64_try_cmpxchg_release(v, old, new); @@ -1110,6 +1182,7 @@ atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, 
sizeof(*v)); return arch_atomic64_sub_and_test(i, v); } @@ -1117,6 +1190,7 @@ atomic64_sub_and_test(s64 i, atomic64_t *v) static __always_inline bool atomic64_dec_and_test(atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_and_test(v); } @@ -1124,6 +1198,7 @@ atomic64_dec_and_test(atomic64_t *v) static __always_inline bool atomic64_inc_and_test(atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_and_test(v); } @@ -1131,6 +1206,7 @@ atomic64_inc_and_test(atomic64_t *v) static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_negative(i, v); } @@ -1138,6 +1214,7 @@ atomic64_add_negative(s64 i, atomic64_t *v) static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add_unless(v, a, u); } @@ -1145,6 +1222,7 @@ atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_unless(v, a, u); } @@ -1152,6 +1230,7 @@ atomic64_add_unless(atomic64_t *v, s64 a, s64 u) static __always_inline bool atomic64_inc_not_zero(atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_not_zero(v); } @@ -1159,6 +1238,7 @@ atomic64_inc_not_zero(atomic64_t *v) static __always_inline bool atomic64_inc_unless_negative(atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_unless_negative(v); } @@ -1166,6 +1246,7 @@ atomic64_inc_unless_negative(atomic64_t *v) static __always_inline bool atomic64_dec_unless_positive(atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_unless_positive(v); } @@ -1173,6 
+1254,7 @@ atomic64_dec_unless_positive(atomic64_t *v) static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_if_positive(v); } @@ -1201,6 +1283,7 @@ atomic_long_set(atomic_long_t *v, long i) static __always_inline void atomic_long_set_release(atomic_long_t *v, long i) { + kcsan_release(); instrument_atomic_write(v, sizeof(*v)); arch_atomic_long_set_release(v, i); } @@ -1215,6 +1298,7 @@ atomic_long_add(long i, atomic_long_t *v) static __always_inline long atomic_long_add_return(long i, atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_add_return(i, v); } @@ -1229,6 +1313,7 @@ atomic_long_add_return_acquire(long i, atomic_long_t *v) static __always_inline long atomic_long_add_return_release(long i, atomic_long_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_add_return_release(i, v); } @@ -1243,6 +1328,7 @@ atomic_long_add_return_relaxed(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_add(long i, atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_add(i, v); } @@ -1257,6 +1343,7 @@ atomic_long_fetch_add_acquire(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_add_release(long i, atomic_long_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_add_release(i, v); } @@ -1278,6 +1365,7 @@ atomic_long_sub(long i, atomic_long_t *v) static __always_inline long atomic_long_sub_return(long i, atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_sub_return(i, v); } @@ -1292,6 +1380,7 @@ atomic_long_sub_return_acquire(long i, atomic_long_t *v) static __always_inline long atomic_long_sub_return_release(long i, atomic_long_t *v) { + kcsan_release(); instrument_atomic_read_write(v, 
sizeof(*v)); return arch_atomic_long_sub_return_release(i, v); } @@ -1306,6 +1395,7 @@ atomic_long_sub_return_relaxed(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_sub(long i, atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_sub(i, v); } @@ -1320,6 +1410,7 @@ atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_sub_release(long i, atomic_long_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_sub_release(i, v); } @@ -1341,6 +1432,7 @@ atomic_long_inc(atomic_long_t *v) static __always_inline long atomic_long_inc_return(atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_return(v); } @@ -1355,6 +1447,7 @@ atomic_long_inc_return_acquire(atomic_long_t *v) static __always_inline long atomic_long_inc_return_release(atomic_long_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_return_release(v); } @@ -1369,6 +1462,7 @@ atomic_long_inc_return_relaxed(atomic_long_t *v) static __always_inline long atomic_long_fetch_inc(atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_inc(v); } @@ -1383,6 +1477,7 @@ atomic_long_fetch_inc_acquire(atomic_long_t *v) static __always_inline long atomic_long_fetch_inc_release(atomic_long_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_inc_release(v); } @@ -1404,6 +1499,7 @@ atomic_long_dec(atomic_long_t *v) static __always_inline long atomic_long_dec_return(atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_return(v); } @@ -1418,6 +1514,7 @@ atomic_long_dec_return_acquire(atomic_long_t *v) static __always_inline long atomic_long_dec_return_release(atomic_long_t *v) { + kcsan_release(); 
instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_return_release(v); } @@ -1432,6 +1529,7 @@ atomic_long_dec_return_relaxed(atomic_long_t *v) static __always_inline long atomic_long_fetch_dec(atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_dec(v); } @@ -1446,6 +1544,7 @@ atomic_long_fetch_dec_acquire(atomic_long_t *v) static __always_inline long atomic_long_fetch_dec_release(atomic_long_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_dec_release(v); } @@ -1467,6 +1566,7 @@ atomic_long_and(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_and(long i, atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_and(i, v); } @@ -1481,6 +1581,7 @@ atomic_long_fetch_and_acquire(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_and_release(long i, atomic_long_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_and_release(i, v); } @@ -1502,6 +1603,7 @@ atomic_long_andnot(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_andnot(long i, atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_andnot(i, v); } @@ -1516,6 +1618,7 @@ atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_andnot_release(i, v); } @@ -1537,6 +1640,7 @@ atomic_long_or(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_or(long i, atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_or(i, v); } @@ -1551,6 +1655,7 @@ atomic_long_fetch_or_acquire(long i, atomic_long_t *v) static __always_inline 
long atomic_long_fetch_or_release(long i, atomic_long_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_or_release(i, v); } @@ -1572,6 +1677,7 @@ atomic_long_xor(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_xor(long i, atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_xor(i, v); } @@ -1586,6 +1692,7 @@ atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_xor_release(long i, atomic_long_t *v) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_xor_release(i, v); } @@ -1600,6 +1707,7 @@ atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) static __always_inline long atomic_long_xchg(atomic_long_t *v, long i) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_xchg(v, i); } @@ -1614,6 +1722,7 @@ atomic_long_xchg_acquire(atomic_long_t *v, long i) static __always_inline long atomic_long_xchg_release(atomic_long_t *v, long i) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_xchg_release(v, i); } @@ -1628,6 +1737,7 @@ atomic_long_xchg_relaxed(atomic_long_t *v, long i) static __always_inline long atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_cmpxchg(v, old, new); } @@ -1642,6 +1752,7 @@ atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) static __always_inline long atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_cmpxchg_release(v, old, new); } @@ -1656,6 +1767,7 @@ atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) static __always_inline bool atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { + kcsan_mb(); 
instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_long_try_cmpxchg(v, old, new); @@ -1672,6 +1784,7 @@ atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) static __always_inline bool atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { + kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_long_try_cmpxchg_release(v, old, new); @@ -1688,6 +1801,7 @@ atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) static __always_inline bool atomic_long_sub_and_test(long i, atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_sub_and_test(i, v); } @@ -1695,6 +1809,7 @@ atomic_long_sub_and_test(long i, atomic_long_t *v) static __always_inline bool atomic_long_dec_and_test(atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_and_test(v); } @@ -1702,6 +1817,7 @@ atomic_long_dec_and_test(atomic_long_t *v) static __always_inline bool atomic_long_inc_and_test(atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_and_test(v); } @@ -1709,6 +1825,7 @@ atomic_long_inc_and_test(atomic_long_t *v) static __always_inline bool atomic_long_add_negative(long i, atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_add_negative(i, v); } @@ -1716,6 +1833,7 @@ atomic_long_add_negative(long i, atomic_long_t *v) static __always_inline long atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_add_unless(v, a, u); } @@ -1723,6 +1841,7 @@ atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) static __always_inline bool atomic_long_add_unless(atomic_long_t *v, long a, long u) { + kcsan_mb(); 
instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_add_unless(v, a, u); } @@ -1730,6 +1849,7 @@ atomic_long_add_unless(atomic_long_t *v, long a, long u) static __always_inline bool atomic_long_inc_not_zero(atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_not_zero(v); } @@ -1737,6 +1857,7 @@ atomic_long_inc_not_zero(atomic_long_t *v) static __always_inline bool atomic_long_inc_unless_negative(atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_unless_negative(v); } @@ -1744,6 +1865,7 @@ atomic_long_inc_unless_negative(atomic_long_t *v) static __always_inline bool atomic_long_dec_unless_positive(atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_unless_positive(v); } @@ -1751,6 +1873,7 @@ atomic_long_dec_unless_positive(atomic_long_t *v) static __always_inline long atomic_long_dec_if_positive(atomic_long_t *v) { + kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_if_positive(v); } @@ -1758,6 +1881,7 @@ atomic_long_dec_if_positive(atomic_long_t *v) #define xchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ + kcsan_mb(); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_xchg(__ai_ptr, __VA_ARGS__); \ }) @@ -1772,6 +1896,7 @@ atomic_long_dec_if_positive(atomic_long_t *v) #define xchg_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ + kcsan_release(); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_xchg_release(__ai_ptr, __VA_ARGS__); \ }) @@ -1786,6 +1911,7 @@ atomic_long_dec_if_positive(atomic_long_t *v) #define cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ + kcsan_mb(); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) @@ -1800,6 +1926,7 @@ atomic_long_dec_if_positive(atomic_long_t *v) #define cmpxchg_release(ptr, ...) 
\ ({ \ typeof(ptr) __ai_ptr = (ptr); \ + kcsan_release(); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg_release(__ai_ptr, __VA_ARGS__); \ }) @@ -1814,6 +1941,7 @@ atomic_long_dec_if_positive(atomic_long_t *v) #define cmpxchg64(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ + kcsan_mb(); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg64(__ai_ptr, __VA_ARGS__); \ }) @@ -1828,6 +1956,7 @@ atomic_long_dec_if_positive(atomic_long_t *v) #define cmpxchg64_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ + kcsan_release(); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \ }) @@ -1843,6 +1972,7 @@ atomic_long_dec_if_positive(atomic_long_t *v) ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ + kcsan_mb(); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ arch_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \ @@ -1861,6 +1991,7 @@ atomic_long_dec_if_positive(atomic_long_t *v) ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ + kcsan_release(); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ arch_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ @@ -1892,6 +2023,7 @@ atomic_long_dec_if_positive(atomic_long_t *v) #define sync_cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ + kcsan_mb(); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) @@ -1899,6 +2031,7 @@ atomic_long_dec_if_positive(atomic_long_t *v) #define cmpxchg_double(ptr, ...) 
\ ({ \ typeof(ptr) __ai_ptr = (ptr); \ + kcsan_mb(); \ instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \ arch_cmpxchg_double(__ai_ptr, __VA_ARGS__); \ }) @@ -1912,4 +2045,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) }) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// 2a9553f0a9d5619f19151092df5cabbbf16ce835 +// 87c974b93032afd42143613434d1a7788fa598f9 -- cgit v1.2.3 From a015b7085979b12e55f67f3b86be0321fff6be3f Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 30 Nov 2021 12:44:32 +0100 Subject: compiler_attributes.h: Add __disable_sanitizer_instrumentation The new attribute maps to __attribute__((disable_sanitizer_instrumentation)), which will be supported by Clang >= 14.0. Future support in GCC is also possible. This attribute disables compiler instrumentation for kernel sanitizer tools, making it easier to implement noinstr. It is different from the existing __no_sanitize* attributes, which may still allow certain types of instrumentation to prevent false positives. Signed-off-by: Alexander Potapenko Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/compiler_attributes.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index b9121afd8733..37e260020221 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -308,6 +308,24 @@ # define __compiletime_warning(msg) #endif +/* + * Optional: only supported since clang >= 14.0 + * + * clang: https://clang.llvm.org/docs/AttributeReference.html#disable-sanitizer-instrumentation + * + * disable_sanitizer_instrumentation is not always similar to + * no_sanitize((<sanitizer-name>)): the latter may still let specific sanitizers + * insert code into functions to prevent false positives. Unlike that, + * disable_sanitizer_instrumentation prevents all kinds of instrumentation to + * functions with the attribute.
+ */ +#if __has_attribute(disable_sanitizer_instrumentation) +# define __disable_sanitizer_instrumentation \ + __attribute__((disable_sanitizer_instrumentation)) +#else +# define __disable_sanitizer_instrumentation +#endif + /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-weak-function-attribute * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-weak-variable-attribute -- cgit v1.2.3 From bd3d5bd1a0ad386475ea7a3de8a91e7d8a600536 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:33 +0100 Subject: kcsan: Support WEAK_MEMORY with Clang where no objtool support exists Clang and GCC behave a little differently when it comes to the __no_sanitize_thread attribute, which has valid reasons, and depending on context either one could be right. Traditionally, user space ThreadSanitizer [1] still expects instrumented builtin atomics (to avoid false positives) and __tsan_func_{entry,exit} (to generate meaningful stack traces), even if the function has the attribute no_sanitize("thread"). [1] https://clang.llvm.org/docs/ThreadSanitizer.html#attribute-no-sanitize-thread GCC doesn't follow the same policy (for better or worse), and removes all kinds of instrumentation if no_sanitize is added. Arguably, since this may be a problem for user space ThreadSanitizer, we expect this may change in future. Since KCSAN != ThreadSanitizer, the likelihood of false positives even without barrier instrumentation everywhere, is much lower by design. At least for Clang, however, to fully remove all sanitizer instrumentation, we must add the disable_sanitizer_instrumentation attribute, which is available since Clang 14.0. Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- include/linux/compiler_types.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 1d32f4c03c9e..3c1795fdb568 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -198,9 +198,20 @@ struct ftrace_likely_data { # define __no_kasan_or_inline __always_inline #endif -#define __no_kcsan __no_sanitize_thread #ifdef __SANITIZE_THREAD__ +/* + * Clang still emits instrumentation for __tsan_func_{entry,exit}() and builtin + * atomics even with __no_sanitize_thread (to avoid false positives in userspace + * ThreadSanitizer). The kernel's requirements are stricter and we really do not + * want any instrumentation with __no_kcsan. + * + * Therefore we add __disable_sanitizer_instrumentation where available to + * disable all instrumentation. See Kconfig.kcsan where this is mandatory. + */ +# define __no_kcsan __no_sanitize_thread __disable_sanitizer_instrumentation # define __no_sanitize_or_inline __no_kcsan notrace __maybe_unused +#else +# define __no_kcsan #endif #ifndef __no_sanitize_or_inline -- cgit v1.2.3 From 80d7476fa20a3cc83f76b3b02a7575891d1e7511 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Sat, 4 Dec 2021 13:57:03 +0100 Subject: kcsan: Turn barrier instrumentation into macros Some architectures use barriers in 'extern inline' functions, from which we should not refer to static inline functions. 
For example, building Alpha with gcc and W=1 shows: ./include/asm-generic/barrier.h:70:30: warning: 'kcsan_rmb' is static but used in inline function 'pmd_offset' which is not static 70 | #define smp_rmb() do { kcsan_rmb(); __smp_rmb(); } while (0) | ^~~~~~~~~ ./arch/alpha/include/asm/pgtable.h:293:9: note: in expansion of macro 'smp_rmb' 293 | smp_rmb(); /* see above */ | ^~~~~~~ Which seems to warn about 6.7.4#3 of the C standard: "An inline definition of a function with external linkage shall not contain a definition of a modifiable object with static or thread storage duration, and shall not contain a reference to an identifier with internal linkage." Fix it by turning barrier instrumentation into macros, which matches definitions in <asm/barrier.h>. Perhaps we can revert this change in future, when there are no more 'extern inline' users left. Link: https://lkml.kernel.org/r/202112041334.X44uWZXf-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/kcsan-checks.h | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index 9d2c869167f2..92f3843d9ebb 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -241,28 +241,30 @@ static inline void __kcsan_disable_current(void) { } * disabled with the __no_kcsan function attribute. * * Also see definition of __tsan_atomic_signal_fence() in kernel/kcsan/core.c. + * + * These are all macros, like <asm/barrier.h>, since some architectures use them + * in non-static inline functions.
*/ #define __KCSAN_BARRIER_TO_SIGNAL_FENCE(name) \ - static __always_inline void kcsan_##name(void) \ - { \ + do { \ barrier(); \ __atomic_signal_fence(__KCSAN_BARRIER_TO_SIGNAL_FENCE_##name); \ barrier(); \ - } -__KCSAN_BARRIER_TO_SIGNAL_FENCE(mb) -__KCSAN_BARRIER_TO_SIGNAL_FENCE(wmb) -__KCSAN_BARRIER_TO_SIGNAL_FENCE(rmb) -__KCSAN_BARRIER_TO_SIGNAL_FENCE(release) + } while (0) +#define kcsan_mb() __KCSAN_BARRIER_TO_SIGNAL_FENCE(mb) +#define kcsan_wmb() __KCSAN_BARRIER_TO_SIGNAL_FENCE(wmb) +#define kcsan_rmb() __KCSAN_BARRIER_TO_SIGNAL_FENCE(rmb) +#define kcsan_release() __KCSAN_BARRIER_TO_SIGNAL_FENCE(release) #elif defined(CONFIG_KCSAN_WEAK_MEMORY) && defined(__KCSAN_INSTRUMENT_BARRIERS__) #define kcsan_mb __kcsan_mb #define kcsan_wmb __kcsan_wmb #define kcsan_rmb __kcsan_rmb #define kcsan_release __kcsan_release #else /* CONFIG_KCSAN_WEAK_MEMORY && ... */ -static inline void kcsan_mb(void) { } -static inline void kcsan_wmb(void) { } -static inline void kcsan_rmb(void) { } -static inline void kcsan_release(void) { } +#define kcsan_mb() do { } while (0) +#define kcsan_wmb() do { } while (0) +#define kcsan_rmb() do { } while (0) +#define kcsan_release() do { } while (0) #endif /* CONFIG_KCSAN_WEAK_MEMORY && ... */ /** -- cgit v1.2.3 From db67097aa6f2587b44055f2e16db72a11e17faef Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 9 Dec 2021 21:31:56 -0700 Subject: pktdvd: stop using bdi congestion framework. The bdi congestion framework isn't widely used and should be deprecated. pktdvd makes use of it to track congestion, but this can be done entirely internally to pktdvd, so it doesn't need to use the framework. So introduce a "congested" flag. When waiting for bio_queue_size to drop, set this flag and a var_waitqueue() to wait for it. When bio_queue_size does drop and this flag is set, clear the flag and call wake_up_var(). 
We don't use a wait_var_event macro for the waiting as we need to set the flag and drop the spinlock before calling schedule() and while that is possible with __wait_var_event(), result is not easy to read. Reviewed-by: Christoph Hellwig Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/163910843527.9928.857338663717630212@noble.neil.brown.name Signed-off-by: Jens Axboe --- include/linux/pktcdvd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 174601554b06..c391e694aa26 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -183,6 +183,8 @@ struct pktcdvd_device spinlock_t lock; /* Serialize access to bio_queue */ struct rb_root bio_queue; /* Work queue of bios we need to handle */ int bio_queue_size; /* Number of nodes in bio_queue */ + bool congested; /* Someone is waiting for bio_queue_size + * to drop. */ sector_t current_sector; /* Keep track of where the elevator is */ atomic_t scan_queue; /* Set to non-zero when pkt_handle_queue */ /* needs to be run. */ -- cgit v1.2.3 From f95711242390d759f69fd67ad46b31491fe904d6 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Wed, 8 Dec 2021 17:43:53 +0000 Subject: EDAC: Add RDDR5 and LRDDR5 memory types Include Registered-DDR5 and Load-Reduced DDR5 in the list of memory types. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211208174356.1997855-2-yazen.ghannam@amd.com --- include/linux/edac.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index 4207d06996a4..e730b3468719 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -182,6 +182,8 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_LRDDR4: Load-Reduced DDR4 memory. * @MEM_LPDDR4: Low-Power DDR4 memory. 
* @MEM_DDR5: Unbuffered DDR5 RAM + * @MEM_RDDR5: Registered DDR5 RAM + * @MEM_LRDDR5: Load-Reduced DDR5 memory. * @MEM_NVDIMM: Non-volatile RAM * @MEM_WIO2: Wide I/O 2. * @MEM_HBM2: High bandwidth Memory Gen 2. @@ -211,6 +213,8 @@ enum mem_type { MEM_LRDDR4, MEM_LPDDR4, MEM_DDR5, + MEM_RDDR5, + MEM_LRDDR5, MEM_NVDIMM, MEM_WIO2, MEM_HBM2, @@ -239,6 +243,8 @@ enum mem_type { #define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) #define MEM_FLAG_LPDDR4 BIT(MEM_LPDDR4) #define MEM_FLAG_DDR5 BIT(MEM_DDR5) +#define MEM_FLAG_RDDR5 BIT(MEM_RDDR5) +#define MEM_FLAG_LRDDR5 BIT(MEM_LRDDR5) #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) #define MEM_FLAG_WIO2 BIT(MEM_WIO2) #define MEM_FLAG_HBM2 BIT(MEM_HBM2) -- cgit v1.2.3 From 1614b2b11fab29dd4ff31ebba9d266961f5af69e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 29 Nov 2021 14:28:41 +0000 Subject: arch: Make ARCH_STACKWALK independent of STACKTRACE Make arch_stack_walk() available for ARCH_STACKWALK architectures without it being entangled in STACKTRACE. Link: https://lore.kernel.org/lkml/20211022152104.356586621@infradead.org/ Signed-off-by: Peter Zijlstra (Intel) [Mark: rebase, drop unnecessary arm change] Signed-off-by: Mark Rutland Cc: Albert Ou Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Dave Hansen Cc: Heiko Carstens Cc: Ingo Molnar Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Thomas Gleixner Cc: Vasily Gorbik Link: https://lore.kernel.org/r/20211129142849.3056714-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- include/linux/stacktrace.h | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index bef158815e83..97455880ac41 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -8,22 +8,6 @@ struct task_struct; struct pt_regs; -#ifdef CONFIG_STACKTRACE -void stack_trace_print(const unsigned long *trace, unsigned int nr_entries, - int spaces); 
-int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries, - unsigned int nr_entries, int spaces); -unsigned int stack_trace_save(unsigned long *store, unsigned int size, - unsigned int skipnr); -unsigned int stack_trace_save_tsk(struct task_struct *task, - unsigned long *store, unsigned int size, - unsigned int skipnr); -unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, - unsigned int size, unsigned int skipnr); -unsigned int stack_trace_save_user(unsigned long *store, unsigned int size); -unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); - -/* Internal interfaces. Do not use in generic code */ #ifdef CONFIG_ARCH_STACKWALK /** @@ -76,8 +60,25 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, const struct pt_regs *regs); +#endif /* CONFIG_ARCH_STACKWALK */ -#else /* CONFIG_ARCH_STACKWALK */ +#ifdef CONFIG_STACKTRACE +void stack_trace_print(const unsigned long *trace, unsigned int nr_entries, + int spaces); +int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries, + unsigned int nr_entries, int spaces); +unsigned int stack_trace_save(unsigned long *store, unsigned int size, + unsigned int skipnr); +unsigned int stack_trace_save_tsk(struct task_struct *task, + unsigned long *store, unsigned int size, + unsigned int skipnr); +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, + unsigned int size, unsigned int skipnr); +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size); +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); + +#ifndef CONFIG_ARCH_STACKWALK +/* Internal interfaces. 
Do not use in generic code */ struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; -- cgit v1.2.3 From 9ba74e6c9e9d0c5c1e5792a7111fc7d1a0589cb8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Dec 2021 23:44:21 -0800 Subject: net: add networking namespace refcount tracker We have 100+ syzbot reports about netns being dismantled too soon, still unresolved as of today. We think a missing get_net() or an extra put_net() is the root cause. In order to find the bug(s), and be able to spot future ones, this patch adds CONFIG_NET_NS_REFCNT_TRACKER and new helpers to precisely pair all put_net() with corresponding get_net(). To use these helpers, each data structure owning a refcount should also use a "netns_tracker" to pair the get and put. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a748ee9a421..235d5d082f1a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -48,7 +48,7 @@ #include #include #include -#include +#include struct netpoll_info; struct device; @@ -300,13 +300,6 @@ enum netdev_state_t { __LINK_STATE_TESTING, }; - -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER -typedef struct ref_tracker *netdevice_tracker; -#else -typedef struct {} netdevice_tracker; -#endif - struct gro_list { struct list_head list; int count; -- cgit v1.2.3 From 04a931e58d1944ab3d1e11fdfde1947fbe5b6a37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Dec 2021 23:44:23 -0800 Subject: net: add netns refcount tracker to struct seq_net_private Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/seq_file_net.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seq_file_net.h b/include/linux/seq_file_net.h index 0fdbe1ddd8d1..b97912fdbae7 100644 --- 
a/include/linux/seq_file_net.h +++ b/include/linux/seq_file_net.h @@ -9,7 +9,8 @@ extern struct net init_net; struct seq_net_private { #ifdef CONFIG_NET_NS - struct net *net; + struct net *net; + netns_tracker ns_tracker; #endif }; -- cgit v1.2.3 From 65c7cdedeb3026fabcc967a7aae2f755ad4d0783 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Sep 2021 11:24:17 -0400 Subject: genirq: Provide new interfaces for affinity hints The discussion about removing the side effect of irq_set_affinity_hint() of actually applying the cpumask (if not NULL) as affinity to the interrupt, unearthed a few unpleasantries: 1) The modular perf drivers rely on the current behaviour for the very wrong reasons. 2) While none of the other drivers prevents user space from changing the affinity, a cursory inspection shows that there are at least expectations in some drivers. #1 needs to be cleaned up anyway, so that's not a problem #2 might result in subtle regressions especially when irqbalanced (which nowadays ignores the affinity hint) is disabled. Provide new interfaces: irq_update_affinity_hint() - Only sets the affinity hint pointer irq_set_affinity_and_hint() - Set the pointer and apply the affinity to the interrupt Make irq_set_affinity_hint() a wrapper around irq_apply_affinity_hint() and document it to be phased out.
Signed-off-by: Thomas Gleixner Signed-off-by: Nitesh Narayan Lal Signed-off-by: Thomas Gleixner Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210501021832.743094-1-jesse.brandeburg@intel.com Link: https://lore.kernel.org/r/20210903152430.244937-2-nitesh@redhat.com --- include/linux/interrupt.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 1f22a30c0963..9367f1cb2e3c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -329,7 +329,46 @@ extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); -extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); +extern int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, + bool setaffinity); + +/** + * irq_update_affinity_hint - Update the affinity hint + * @irq: Interrupt to update + * @m: cpumask pointer (NULL to clear the hint) + * + * Updates the affinity hint, but does not change the affinity of the interrupt. + */ +static inline int +irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) +{ + return __irq_apply_affinity_hint(irq, m, false); +} + +/** + * irq_set_affinity_and_hint - Update the affinity hint and apply the provided + * cpumask to the interrupt + * @irq: Interrupt to update + * @m: cpumask pointer (NULL to clear the hint) + * + * Updates the affinity hint and if @m is not NULL it applies it as the + * affinity of that interrupt. + */ +static inline int +irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) +{ + return __irq_apply_affinity_hint(irq, m, true); +} + +/* + * Deprecated. Use irq_update_affinity_hint() or irq_set_affinity_and_hint() + * instead. 
+ */ +static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) +{ + return irq_set_affinity_and_hint(irq, m); +} + extern int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity); @@ -361,6 +400,18 @@ static inline int irq_can_set_affinity(unsigned int irq) static inline int irq_select_affinity(unsigned int irq) { return 0; } +static inline int irq_update_affinity_hint(unsigned int irq, + const struct cpumask *m) +{ + return -EINVAL; +} + +static inline int irq_set_affinity_and_hint(unsigned int irq, + const struct cpumask *m) +{ + return -EINVAL; +} + static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { -- cgit v1.2.3 From e4779015fd5d2fb8390c258268addff24d6077c7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:22 -0800 Subject: timers: implement usleep_idle_range() Patch series "mm/damon: Fix fake /proc/loadavg reports", v3. This patchset fixes DAMON's fake load report issue. The first patch makes yet another variant of usleep_range() for this fix, and the second patch fixes the issue of DAMON by making it using the newly introduced function. This patch (of 2): Some kernel threads such as DAMON could need to repeatedly sleep in micro seconds level. Because usleep_range() sleeps in uninterruptible state, however, such threads would make /proc/loadavg reports fake load. To help such cases, this commit implements a variant of usleep_range() called usleep_idle_range(). It is same to usleep_range() but sets the state of the current task as TASK_IDLE while sleeping. 
Link: https://lkml.kernel.org/r/20211126145015.15862-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211126145015.15862-2-sj@kernel.org Signed-off-by: SeongJae Park Suggested-by: Andrew Morton Reviewed-by: Thomas Gleixner Tested-by: Oleksandr Natalenko Cc: John Stultz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delay.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/delay.h b/include/linux/delay.h index 8eacf67eb212..039e7e0c7378 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -20,6 +20,7 @@ */ #include +#include extern unsigned long loops_per_jiffy; @@ -58,7 +59,18 @@ void calibrate_delay(void); void __attribute__((weak)) calibration_delay_done(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); -void usleep_range(unsigned long min, unsigned long max); +void usleep_range_state(unsigned long min, unsigned long max, + unsigned int state); + +static inline void usleep_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); +} + +static inline void usleep_idle_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_IDLE); +} static inline void ssleep(unsigned int seconds) { -- cgit v1.2.3 From bff8c3848e071d387d8b0784dc91fa49cd563774 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Nov 2021 11:01:03 +0100 Subject: bitfield.h: Fix "type of reg too small for mask" test The test: 'mask > (typeof(_reg))~0ull' only works correctly when both sides are unsigned, consider: - 0xff000000 vs (int)~0ull - 0x000000ff vs (int)~0ull Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20211110101324.950210584@infradead.org --- include/linux/bitfield.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git 
a/include/linux/bitfield.h b/include/linux/bitfield.h index 4e035aca6f7e..6093fa6db260 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -41,6 +41,22 @@ #define __bf_shf(x) (__builtin_ffsll(x) - 1) +#define __scalar_type_to_unsigned_cases(type) \ + unsigned type: (unsigned type)0, \ + signed type: (unsigned type)0 + +#define __unsigned_scalar_typeof(x) typeof( \ + _Generic((x), \ + char: (unsigned char)0, \ + __scalar_type_to_unsigned_cases(char), \ + __scalar_type_to_unsigned_cases(short), \ + __scalar_type_to_unsigned_cases(int), \ + __scalar_type_to_unsigned_cases(long), \ + __scalar_type_to_unsigned_cases(long long), \ + default: (x))) + +#define __bf_cast_unsigned(type, x) ((__unsigned_scalar_typeof(type))(x)) + #define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx) \ ({ \ BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ @@ -49,7 +65,8 @@ BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \ _pfx "value too large for the field"); \ - BUILD_BUG_ON_MSG((_mask) > (typeof(_reg))~0ull, \ + BUILD_BUG_ON_MSG(__bf_cast_unsigned(_mask, _mask) > \ + __bf_cast_unsigned(_reg, ~0ull), \ _pfx "type of reg too small for mask"); \ __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \ (1ULL << __bf_shf(_mask))); \ -- cgit v1.2.3 From 4121485d271bd730537f613ce041e7ea659606a7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 9 Dec 2021 21:52:31 +0200 Subject: PCI: Sort Intel Device IDs by value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sort Intel Device IDs by value. 
[bhelgaas: lower-case Intel section since we're touching it anyway] Link: https://lore.kernel.org/r/20211209195231.2785-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- include/linux/pci_ids.h | 50 ++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 011f2f1ea5bb..0d26ab7eb7dc 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2635,8 +2635,8 @@ #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320 #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321 #define PCI_DEVICE_ID_INTEL_PXH_0 0x0329 -#define PCI_DEVICE_ID_INTEL_PXH_1 0x032A -#define PCI_DEVICE_ID_INTEL_PXHV 0x032C +#define PCI_DEVICE_ID_INTEL_PXH_1 0x032a +#define PCI_DEVICE_ID_INTEL_PXHV 0x032c #define PCI_DEVICE_ID_INTEL_80332_0 0x0330 #define PCI_DEVICE_ID_INTEL_80332_1 0x0332 #define PCI_DEVICE_ID_INTEL_80333_0 0x0370 @@ -2654,14 +2654,14 @@ #define PCI_DEVICE_ID_INTEL_MFD_SDIO2 0x0822 #define PCI_DEVICE_ID_INTEL_MFD_EMMC0 0x0823 #define PCI_DEVICE_ID_INTEL_MFD_EMMC1 0x0824 -#define PCI_DEVICE_ID_INTEL_MRST_SD2 0x084F -#define PCI_DEVICE_ID_INTEL_QUARK_X1000_ILB 0x095E +#define PCI_DEVICE_ID_INTEL_MRST_SD2 0x084f +#define PCI_DEVICE_ID_INTEL_QUARK_X1000_ILB 0x095e #define PCI_DEVICE_ID_INTEL_I960 0x0960 #define PCI_DEVICE_ID_INTEL_I960RM 0x0962 #define PCI_DEVICE_ID_INTEL_CENTERTON_ILB 0x0c60 #define PCI_DEVICE_ID_INTEL_8257X_SOL 0x1062 #define PCI_DEVICE_ID_INTEL_82573E_SOL 0x1085 -#define PCI_DEVICE_ID_INTEL_82573L_SOL 0x108F +#define PCI_DEVICE_ID_INTEL_82573L_SOL 0x108f #define PCI_DEVICE_ID_INTEL_82815_MC 0x1130 #define PCI_DEVICE_ID_INTEL_82815_CGC 0x1132 #define PCI_DEVICE_ID_INTEL_82092AA_0 0x1221 @@ -2755,12 +2755,6 @@ #define PCI_DEVICE_ID_INTEL_82801EB_11 0x24db #define PCI_DEVICE_ID_INTEL_82801EB_12 0x24dc #define PCI_DEVICE_ID_INTEL_82801EB_13 0x24dd -#define 
PCI_DEVICE_ID_INTEL_ESB_1 0x25a1 -#define PCI_DEVICE_ID_INTEL_ESB_2 0x25a2 -#define PCI_DEVICE_ID_INTEL_ESB_4 0x25a4 -#define PCI_DEVICE_ID_INTEL_ESB_5 0x25a6 -#define PCI_DEVICE_ID_INTEL_ESB_9 0x25ab -#define PCI_DEVICE_ID_INTEL_ESB_10 0x25ac #define PCI_DEVICE_ID_INTEL_82820_HB 0x2500 #define PCI_DEVICE_ID_INTEL_82820_UP_HB 0x2501 #define PCI_DEVICE_ID_INTEL_82850_HB 0x2530 @@ -2775,14 +2769,15 @@ #define PCI_DEVICE_ID_INTEL_82915G_IG 0x2582 #define PCI_DEVICE_ID_INTEL_82915GM_HB 0x2590 #define PCI_DEVICE_ID_INTEL_82915GM_IG 0x2592 -#define PCI_DEVICE_ID_INTEL_5000_ERR 0x25F0 -#define PCI_DEVICE_ID_INTEL_5000_FBD0 0x25F5 -#define PCI_DEVICE_ID_INTEL_5000_FBD1 0x25F6 -#define PCI_DEVICE_ID_INTEL_82945G_HB 0x2770 -#define PCI_DEVICE_ID_INTEL_82945G_IG 0x2772 -#define PCI_DEVICE_ID_INTEL_3000_HB 0x2778 -#define PCI_DEVICE_ID_INTEL_82945GM_HB 0x27A0 -#define PCI_DEVICE_ID_INTEL_82945GM_IG 0x27A2 +#define PCI_DEVICE_ID_INTEL_ESB_1 0x25a1 +#define PCI_DEVICE_ID_INTEL_ESB_2 0x25a2 +#define PCI_DEVICE_ID_INTEL_ESB_4 0x25a4 +#define PCI_DEVICE_ID_INTEL_ESB_5 0x25a6 +#define PCI_DEVICE_ID_INTEL_ESB_9 0x25ab +#define PCI_DEVICE_ID_INTEL_ESB_10 0x25ac +#define PCI_DEVICE_ID_INTEL_5000_ERR 0x25f0 +#define PCI_DEVICE_ID_INTEL_5000_FBD0 0x25f5 +#define PCI_DEVICE_ID_INTEL_5000_FBD1 0x25f6 #define PCI_DEVICE_ID_INTEL_ICH6_0 0x2640 #define PCI_DEVICE_ID_INTEL_ICH6_1 0x2641 #define PCI_DEVICE_ID_INTEL_ICH6_2 0x2642 @@ -2794,6 +2789,11 @@ #define PCI_DEVICE_ID_INTEL_ESB2_14 0x2698 #define PCI_DEVICE_ID_INTEL_ESB2_17 0x269b #define PCI_DEVICE_ID_INTEL_ESB2_18 0x269e +#define PCI_DEVICE_ID_INTEL_82945G_HB 0x2770 +#define PCI_DEVICE_ID_INTEL_82945G_IG 0x2772 +#define PCI_DEVICE_ID_INTEL_3000_HB 0x2778 +#define PCI_DEVICE_ID_INTEL_82945GM_HB 0x27a0 +#define PCI_DEVICE_ID_INTEL_82945GM_IG 0x27a2 #define PCI_DEVICE_ID_INTEL_ICH7_0 0x27b8 #define PCI_DEVICE_ID_INTEL_ICH7_1 0x27b9 #define PCI_DEVICE_ID_INTEL_ICH7_30 0x27b0 @@ -2846,7 +2846,7 @@ #define 
PCI_DEVICE_ID_INTEL_LYNNFIELD_QPI_PHY0 0x2c91 #define PCI_DEVICE_ID_INTEL_LYNNFIELD_MCR 0x2c98 #define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_TAD 0x2c99 -#define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_TEST 0x2c9C +#define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_TEST 0x2c9c #define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_CH0_CTRL 0x2ca0 #define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_CH0_ADDR 0x2ca1 #define PCI_DEVICE_ID_INTEL_LYNNFIELD_MC_CH0_RANK 0x2ca2 @@ -2958,16 +2958,16 @@ #define PCI_DEVICE_ID_INTEL_SBRIDGE_BR 0x3cf5 /* 13.6 */ #define PCI_DEVICE_ID_INTEL_SBRIDGE_SAD1 0x3cf6 /* 12.7 */ #define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f -#define PCI_DEVICE_ID_INTEL_5100_16 0x65f0 -#define PCI_DEVICE_ID_INTEL_5100_19 0x65f3 -#define PCI_DEVICE_ID_INTEL_5100_21 0x65f5 -#define PCI_DEVICE_ID_INTEL_5100_22 0x65f6 #define PCI_DEVICE_ID_INTEL_5400_ERR 0x4030 #define PCI_DEVICE_ID_INTEL_5400_FBD0 0x4035 #define PCI_DEVICE_ID_INTEL_5400_FBD1 0x4036 -#define PCI_DEVICE_ID_INTEL_IOAT_SCNB 0x65ff #define PCI_DEVICE_ID_INTEL_EP80579_0 0x5031 #define PCI_DEVICE_ID_INTEL_EP80579_1 0x5032 +#define PCI_DEVICE_ID_INTEL_5100_16 0x65f0 +#define PCI_DEVICE_ID_INTEL_5100_19 0x65f3 +#define PCI_DEVICE_ID_INTEL_5100_21 0x65f5 +#define PCI_DEVICE_ID_INTEL_5100_22 0x65f6 +#define PCI_DEVICE_ID_INTEL_IOAT_SCNB 0x65ff #define PCI_DEVICE_ID_INTEL_82371SB_0 0x7000 #define PCI_DEVICE_ID_INTEL_82371SB_1 0x7010 #define PCI_DEVICE_ID_INTEL_82371SB_2 0x7020 -- cgit v1.2.3 From c5fb19937455095573a19ddcbff32e993ed10e35 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 10 Dec 2021 22:16:49 +0800 Subject: bpf: Add bpf_strncmp helper The helper compares two strings: one string is a null-terminated read-only string, and another string has const max storage size but doesn't need to be null-terminated. It can be used to compare file name in tracing or LSM program. 
Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211210141652.877186-2-houtao1@huawei.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0ceb54c6342f..7a40022e3d00 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2163,6 +2163,7 @@ extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; +extern const struct bpf_func_proto bpf_strncmp_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From 35d976802124303a5b3eb7ec3ed188d568204373 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:38 +0200 Subject: net: dsa: tag_ocelot: convert to tagger-owned data The felix driver makes very light use of dp->priv, and the tagger is effectively stateless. dp->priv is practically only needed to set up a callback to perform deferred xmit of PTP and STP packets using the ocelot-8021q tagging protocol (the main ocelot tagging protocol makes no use of dp->priv, although this driver sets up dp->priv irrespective of actual tagging protocol in use). struct felix_port (what used to be pointed to by dp->priv) is removed and replaced with a two-sided structure. The public side of this structure, visible to the switch driver, is ocelot_8021q_tagger_data. The private side is ocelot_8021q_tagger_private, and the latter structure physically encapsulates the former. The public half of the tagger data structure can be accessed through a helper of the same name (ocelot_8021q_tagger_data) which also sanity-checks the protocol currently in use by the switch. The public/private split was requested by Andrew Lunn. 
Suggested-by: Andrew Lunn Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/ocelot.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index 7ee708ad7df2..dca2969015d8 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -8,6 +8,7 @@ #include #include #include +#include struct ocelot_skb_cb { struct sk_buff *clone; @@ -168,11 +169,18 @@ struct felix_deferred_xmit_work { struct kthread_work work; }; -struct felix_port { +struct ocelot_8021q_tagger_data { void (*xmit_work_fn)(struct kthread_work *work); - struct kthread_worker *xmit_worker; }; +static inline struct ocelot_8021q_tagger_data * +ocelot_8021q_tagger_data(struct dsa_switch *ds) +{ + BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_OCELOT_8021Q); + + return ds->tagger_data; +} + static inline void ocelot_xfh_get_rew_val(void *extraction, u64 *rew_val) { packing(extraction, rew_val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0); -- cgit v1.2.3 From d38049bbe7601f38d598f5da5ff09980483b290a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:40 +0200 Subject: net: dsa: sja1105: bring deferred xmit implementation in line with ocelot-8021q When the ocelot-8021q driver was converted to deferred xmit as part of commit 8d5f7954b7c8 ("net: dsa: felix: break at first CPU port during init and teardown"), the deferred implementation was deliberately made subtly different from what sja1105 has. The implementation differences lied on the following observations: - There might be a race between these two lines in tag_sja1105.c: skb_queue_tail(&sp->xmit_queue, skb_get(skb)); kthread_queue_work(sp->xmit_worker, &sp->xmit_work); and the skb dequeue logic in sja1105_port_deferred_xmit(). For example, the xmit_work might be already queued, however the work item has just finished walking through the skb queue. 
Because we don't check the return code from kthread_queue_work, we don't do anything if the work item is already queued. However, nobody will take that skb and send it, at least until the next timestampable skb is sent. This creates additional (and avoidable) TX timestamping latency. To close that race, what the ocelot-8021q driver does is it doesn't keep a single work item per port, and a skb timestamping queue, but rather dynamically allocates a work item per packet. - It is also unnecessary to have more than one kthread that does the work. So delete the per-port kthread allocations and replace them with a single kthread which is global to the switch. This change brings the two implementations in line by applying those observations to the sja1105 driver as well. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index e6c78be40bde..acd9d2afccab 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -37,6 +37,12 @@ #define SJA1105_HWTS_RX_EN 0 +struct sja1105_deferred_xmit_work { + struct dsa_port *dp; + struct sk_buff *skb; + struct kthread_work work; +}; + /* Global tagger data: each struct sja1105_port has a reference to * the structure defined in struct sja1105_private. 
*/ @@ -52,6 +58,8 @@ struct sja1105_tagger_data { * 2-step TX timestamps */ struct sk_buff_head skb_txtstamp_queue; + struct kthread_worker *xmit_worker; + void (*xmit_work_fn)(struct kthread_work *work); }; struct sja1105_skb_cb { @@ -65,9 +73,6 @@ struct sja1105_skb_cb { ((struct sja1105_skb_cb *)((skb)->cb)) struct sja1105_port { - struct kthread_worker *xmit_worker; - struct kthread_work xmit_work; - struct sk_buff_head xmit_queue; struct sja1105_tagger_data *data; bool hwts_tx_en; }; -- cgit v1.2.3 From 6f6770ab1ce2b56619264ec6be0b62f05564dcf6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:41 +0200 Subject: net: dsa: sja1105: remove hwts_tx_en from tagger data This tagger property is in fact not used at all by the tagger, only by the switch driver. Therefore it makes sense to be moved to sja1105_private. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index acd9d2afccab..32a8a1344cf6 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -74,7 +74,6 @@ struct sja1105_skb_cb { struct sja1105_port { struct sja1105_tagger_data *data; - bool hwts_tx_en; }; /* Timestamps are in units of 8 ns clock ticks (equivalent to -- cgit v1.2.3 From bfcf1425222008e7390c0784b0f3bb7b497fccaa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:42 +0200 Subject: net: dsa: sja1105: make dp->priv point directly to sja1105_tagger_data The design of the sja1105 tagger dp->priv is that each port has a separate struct sja1105_port, and the sp->data pointer points to a common struct sja1105_tagger_data. We have removed all per-port members accessible by the tagger, and now only struct sja1105_tagger_data remains. Make dp->priv point directly to this. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/dsa/sja1105.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 32a8a1344cf6..1dda9cce85d9 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -43,9 +43,7 @@ struct sja1105_deferred_xmit_work { struct kthread_work work; }; -/* Global tagger data: each struct sja1105_port has a reference to - * the structure defined in struct sja1105_private. - */ +/* Global tagger data */ struct sja1105_tagger_data { struct sk_buff *stampable_skb; /* Protects concurrent access to the meta state machine @@ -72,10 +70,6 @@ struct sja1105_skb_cb { #define SJA1105_SKB_CB(skb) \ ((struct sja1105_skb_cb *)((skb)->cb)) -struct sja1105_port { - struct sja1105_tagger_data *data; -}; - /* Timestamps are in units of 8 ns clock ticks (equivalent to * a fixed 125 MHz clock). */ -- cgit v1.2.3 From 22ee9f8e4011fb8a6d75dc2f9a43360d4f400235 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:43 +0200 Subject: net: dsa: sja1105: move ts_id from sja1105_tagger_data The TX timestamp ID is incremented by the SJA1110 PTP timestamping callback (->port_tx_timestamp) for every packet, when cloning it. It isn't used by the tagger at all, even though it sits inside the struct sja1105_tagger_data. Also, serialization to this structure is currently done through tagger_data->meta_lock, which is a cheap hack because the meta_lock isn't used for anything else on SJA1110 (sja1105_rcv_meta_state_machine isn't called). This change moves ts_id from sja1105_tagger_data to sja1105_private and introduces a dedicated spinlock for it, also in sja1105_private. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/dsa/sja1105.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 1dda9cce85d9..d8ee53085c09 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -51,7 +51,6 @@ struct sja1105_tagger_data { */ spinlock_t meta_lock; unsigned long state; - u8 ts_id; /* Used on SJA1110 where meta frames are generated only for * 2-step TX timestamps */ -- cgit v1.2.3 From c79e84866d2ac637fce921a28288f214e91d662b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:44 +0200 Subject: net: dsa: tag_sja1105: convert to tagger-owned data Currently, struct sja1105_tagger_data is a part of struct sja1105_private, and is used by the sja1105 driver to populate dp->priv. With the movement towards tagger-owned storage, the sja1105 driver should not be the owner of this memory. This change implements the connection between the sja1105 switch driver and its tagging protocol, which means that sja1105_tagger_data no longer stays in dp->priv but in ds->tagger_data, and that the sja1105 driver now only populates the sja1105_port_deferred_xmit callback pointer. The kthread worker is now the responsibility of the tagger. The sja1105 driver also alters the tagger's state some more, especially with regard to the PTP RX timestamping state. This will be fixed up a bit in further changes. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/dsa/sja1105.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index d8ee53085c09..9f7d42cbbc08 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -84,9 +84,12 @@ static inline s64 sja1105_ticks_to_ns(s64 ticks) return ticks * SJA1105_TICK_NS; } -static inline bool dsa_port_is_sja1105(struct dsa_port *dp) +static inline struct sja1105_tagger_data * +sja1105_tagger_data(struct dsa_switch *ds) { - return true; + BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1105); + + return ds->tagger_data; } #endif /* _NET_DSA_SJA1105_H */ -- cgit v1.2.3 From fcbf979a5b4b5784bfb5647ae6190cd5c2ae595d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:45 +0200 Subject: Revert "net: dsa: move sja1110_process_meta_tstamp inside the tagging protocol driver" This reverts commit 6d709cadfde68dbd12bef12fcced6222226dcb06. The above change was done to avoid calling symbols exported by the switch driver from the tagging protocol driver. With the tagger-owned storage model, we have a new option on our hands, and that is for the switch driver to provide a data consumer handler in the form of a function pointer inside the ->connect_tag_protocol() method. Having a function pointer avoids the problems of the exported symbols approach. By creating a handler for metadata frames holding TX timestamps on SJA1110, we are able to eliminate an skb queue from the tagger data, and replace it with a simple, and stateless, function pointer. This skb queue is now handled exclusively by sja1105_ptp.c, which makes the code easier to follow, as it used to be before the reverted patch. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/dsa/sja1105.h | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 9f7d42cbbc08..d216211b64f8 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -37,6 +37,11 @@ #define SJA1105_HWTS_RX_EN 0 +enum sja1110_meta_tstamp { + SJA1110_META_TSTAMP_TX = 0, + SJA1110_META_TSTAMP_RX = 1, +}; + struct sja1105_deferred_xmit_work { struct dsa_port *dp; struct sk_buff *skb; @@ -51,12 +56,10 @@ struct sja1105_tagger_data { */ spinlock_t meta_lock; unsigned long state; - /* Used on SJA1110 where meta frames are generated only for - * 2-step TX timestamps - */ - struct sk_buff_head skb_txtstamp_queue; struct kthread_worker *xmit_worker; void (*xmit_work_fn)(struct kthread_work *work); + void (*meta_tstamp_handler)(struct dsa_switch *ds, int port, u8 ts_id, + enum sja1110_meta_tstamp dir, u64 tstamp); }; struct sja1105_skb_cb { @@ -69,21 +72,6 @@ struct sja1105_skb_cb { #define SJA1105_SKB_CB(skb) \ ((struct sja1105_skb_cb *)((skb)->cb)) -/* Timestamps are in units of 8 ns clock ticks (equivalent to - * a fixed 125 MHz clock). - */ -#define SJA1105_TICK_NS 8 - -static inline s64 ns_to_sja1105_ticks(s64 ns) -{ - return ns / SJA1105_TICK_NS; -} - -static inline s64 sja1105_ticks_to_ns(s64 ticks) -{ - return ticks * SJA1105_TICK_NS; -} - static inline struct sja1105_tagger_data * sja1105_tagger_data(struct dsa_switch *ds) { -- cgit v1.2.3 From 950a419d9de13668f86828394cb242a1f9dece74 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:46 +0200 Subject: net: dsa: tag_sja1105: split sja1105_tagger_data into private and public sections The sja1105 driver messes with the tagging protocol's state when PTP RX timestamping is enabled/disabled. This is fundamentally necessary because the tagger needs to know what to do when it receives a PTP packet. 
If RX timestamping is enabled, then a metadata follow-up frame is expected, and this holds the (partial) timestamp. So the tagger plays hide-and-seek with the network stack until it also gets the metadata frame, and then presents a single packet, the timestamped PTP packet. But when RX timestamping isn't enabled, there is no metadata frame expected, so the hide-and-seek game must be turned off and the packet must be delivered right away to the network stack. Considering this, we create a pseudo isolation by devising two tagger methods callable by the switch: one to get the RX timestamping state, and one to set it. Since we can't export symbols between the tagger and the switch driver, these methods are exposed through function pointers. After this change, the public portion of the sja1105_tagger_data contains only function pointers. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index d216211b64f8..e9cb1ae6d742 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -35,8 +35,6 @@ #define SJA1105_META_SMAC 0x222222222222ull #define SJA1105_META_DMAC 0x0180C200000Eull -#define SJA1105_HWTS_RX_EN 0 - enum sja1110_meta_tstamp { SJA1110_META_TSTAMP_TX = 0, SJA1110_META_TSTAMP_RX = 1, @@ -50,16 +48,13 @@ struct sja1105_deferred_xmit_work { /* Global tagger data */ struct sja1105_tagger_data { - struct sk_buff *stampable_skb; - /* Protects concurrent access to the meta state machine - * from taggers running on multiple ports on SMP systems - */ - spinlock_t meta_lock; - unsigned long state; - struct kthread_worker *xmit_worker; + /* Tagger to switch */ void (*xmit_work_fn)(struct kthread_work *work); void (*meta_tstamp_handler)(struct dsa_switch *ds, int port, u8 ts_id, enum sja1110_meta_tstamp dir, u64 tstamp); + /* Switch to tagger */ + 
bool (*rxtstamp_get_state)(struct dsa_switch *ds); + void (*rxtstamp_set_state)(struct dsa_switch *ds, bool on); }; struct sja1105_skb_cb { -- cgit v1.2.3 From 9020ef659885f2622cfb386cc229b6d618362895 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 17 Oct 2021 18:22:09 +0100 Subject: iio: trigger: Fix a scheduling whilst atomic issue seen on tsc2046 IIO triggers are software IRQ chips that split an incoming IRQ into separate IRQs routed to all devices using the trigger. When all consumers are done then a trigger callback reenable() is called. There are a few circumstances under which this can happen in atomic context. 1) A single user of the trigger that calls the iio_trigger_done() function from interrupt context. 2) A race between disconnecting the last device from a trigger and the trigger itself successfully being disabled. To avoid a resulting scheduling whilst atomic, close this second corner by using schedule_work() to ensure the reenable is not done in atomic context. Note that drivers must be careful to manage the interaction of set_state() and reenable() callbacks to ensure appropriate reference counting if they are relying on the same hardware controls. Deliberately taking this via the slow path rather than via a fixes tree because the error is hard to hit and I would like it to soak for a while before hitting a release kernel. Signed-off-by: Jonathan Cameron Cc: Pengutronix Kernel Team Cc: Dmitry Torokhov Tested-by: Oleksij Rempel Cc: Link: https://lore.kernel.org/r/20211017172209.112387-1-jic23@kernel.org --- include/linux/iio/trigger.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h index 096f68dd2e0c..4c69b144677b 100644 --- a/include/linux/iio/trigger.h +++ b/include/linux/iio/trigger.h @@ -55,6 +55,7 @@ struct iio_trigger_ops { * @attached_own_device:[INTERN] if we are using our own device as trigger, * i.e.
if we registered a poll function to the same * device as the one providing the trigger. + * @reenable_work: [INTERN] work item used to ensure reenable can sleep. **/ struct iio_trigger { const struct iio_trigger_ops *ops; @@ -74,6 +75,7 @@ struct iio_trigger { unsigned long pool[BITS_TO_LONGS(CONFIG_IIO_CONSUMERS_PER_TRIGGER)]; struct mutex pool_lock; bool attached_own_device; + struct work_struct reenable_work; }; -- cgit v1.2.3 From 3ac27afefd5dd6a53e830542b899f092a58b6b51 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 5 Dec 2021 17:01:29 +0000 Subject: iio:dac:ad5755: Switch to generic firmware properties and drop pdata Lars pointed out that platform data can also be supported via the generic properties interface, so there is no point in continuing to support it separately. Hence squish the linux/platform_data/ad5755.h header into the c file and drop accessing the platform data directly. Done by inspection only. Mostly completely mechanical with the exception of a few places where default value handling is more cleanly done by first setting the value, then calling the firmware reading function and not checking the return value, as opposed to reading firmware then setting the default if an error occurs. Part of a general attempt to move all of IIO over to generic device properties, both to enable other firmware types and to remove drivers that can be the source of of_ specific behaviour in new drivers.
Suggested-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron Reviewed-by: Andy Shevchenko --- include/linux/platform_data/ad5755.h | 102 ----------------------------------- 1 file changed, 102 deletions(-) delete mode 100644 include/linux/platform_data/ad5755.h (limited to 'include/linux') diff --git a/include/linux/platform_data/ad5755.h b/include/linux/platform_data/ad5755.h deleted file mode 100644 index e371e08f04bc..000000000000 --- a/include/linux/platform_data/ad5755.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2012 Analog Devices Inc. - */ -#ifndef __LINUX_PLATFORM_DATA_AD5755_H__ -#define __LINUX_PLATFORM_DATA_AD5755_H__ - -enum ad5755_mode { - AD5755_MODE_VOLTAGE_0V_5V = 0, - AD5755_MODE_VOLTAGE_0V_10V = 1, - AD5755_MODE_VOLTAGE_PLUSMINUS_5V = 2, - AD5755_MODE_VOLTAGE_PLUSMINUS_10V = 3, - AD5755_MODE_CURRENT_4mA_20mA = 4, - AD5755_MODE_CURRENT_0mA_20mA = 5, - AD5755_MODE_CURRENT_0mA_24mA = 6, -}; - -enum ad5755_dc_dc_phase { - AD5755_DC_DC_PHASE_ALL_SAME_EDGE = 0, - AD5755_DC_DC_PHASE_A_B_SAME_EDGE_C_D_OPP_EDGE = 1, - AD5755_DC_DC_PHASE_A_C_SAME_EDGE_B_D_OPP_EDGE = 2, - AD5755_DC_DC_PHASE_90_DEGREE = 3, -}; - -enum ad5755_dc_dc_freq { - AD5755_DC_DC_FREQ_250kHZ = 0, - AD5755_DC_DC_FREQ_410kHZ = 1, - AD5755_DC_DC_FREQ_650kHZ = 2, -}; - -enum ad5755_dc_dc_maxv { - AD5755_DC_DC_MAXV_23V = 0, - AD5755_DC_DC_MAXV_24V5 = 1, - AD5755_DC_DC_MAXV_27V = 2, - AD5755_DC_DC_MAXV_29V5 = 3, -}; - -enum ad5755_slew_rate { - AD5755_SLEW_RATE_64k = 0, - AD5755_SLEW_RATE_32k = 1, - AD5755_SLEW_RATE_16k = 2, - AD5755_SLEW_RATE_8k = 3, - AD5755_SLEW_RATE_4k = 4, - AD5755_SLEW_RATE_2k = 5, - AD5755_SLEW_RATE_1k = 6, - AD5755_SLEW_RATE_500 = 7, - AD5755_SLEW_RATE_250 = 8, - AD5755_SLEW_RATE_125 = 9, - AD5755_SLEW_RATE_64 = 10, - AD5755_SLEW_RATE_32 = 11, - AD5755_SLEW_RATE_16 = 12, - AD5755_SLEW_RATE_8 = 13, - AD5755_SLEW_RATE_4 = 14, - AD5755_SLEW_RATE_0_5 = 15, -}; - -enum ad5755_slew_step_size { - 
AD5755_SLEW_STEP_SIZE_1 = 0, - AD5755_SLEW_STEP_SIZE_2 = 1, - AD5755_SLEW_STEP_SIZE_4 = 2, - AD5755_SLEW_STEP_SIZE_8 = 3, - AD5755_SLEW_STEP_SIZE_16 = 4, - AD5755_SLEW_STEP_SIZE_32 = 5, - AD5755_SLEW_STEP_SIZE_64 = 6, - AD5755_SLEW_STEP_SIZE_128 = 7, - AD5755_SLEW_STEP_SIZE_256 = 8, -}; - -/** - * struct ad5755_platform_data - AD5755 DAC driver platform data - * @ext_dc_dc_compenstation_resistor: Whether an external DC-DC converter - * compensation register is used. - * @dc_dc_phase: DC-DC converter phase. - * @dc_dc_freq: DC-DC converter frequency. - * @dc_dc_maxv: DC-DC maximum allowed boost voltage. - * @dac.mode: The mode to be used for the DAC output. - * @dac.ext_current_sense_resistor: Whether an external current sense resistor - * is used. - * @dac.enable_voltage_overrange: Whether to enable 20% voltage output overrange. - * @dac.slew.enable: Whether to enable digital slew. - * @dac.slew.rate: Slew rate of the digital slew. - * @dac.slew.step_size: Slew step size of the digital slew. - **/ -struct ad5755_platform_data { - bool ext_dc_dc_compenstation_resistor; - enum ad5755_dc_dc_phase dc_dc_phase; - enum ad5755_dc_dc_freq dc_dc_freq; - enum ad5755_dc_dc_maxv dc_dc_maxv; - - struct { - enum ad5755_mode mode; - bool ext_current_sense_resistor; - bool enable_voltage_overrange; - struct { - bool enable; - enum ad5755_slew_rate rate; - enum ad5755_slew_step_size step_size; - } slew; - } dac[4]; -}; - -#endif -- cgit v1.2.3 From c537be0bfad6337f2afd618fe252c03217191405 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 3 Dec 2021 11:28:46 +0100 Subject: i2c: acpi: Add i2c_acpi_new_device_by_fwnode() function Change i2c_acpi_new_device() into i2c_acpi_new_device_by_fwnode() and add a static inline wrapper providing the old i2c_acpi_new_device() behavior. This is necessary because in some cases we may only have access to the fwnode / acpi_device and not to the matching physical-node struct device *. 
Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Acked-by: Mika Westerberg Acked-by: Wolfram Sang Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20211203102857.44539-4-hdegoede@redhat.com --- include/linux/i2c.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 16119ac1aa97..7d4f52ceb7b5 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1025,8 +1025,9 @@ bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c); int i2c_acpi_client_count(struct acpi_device *adev); u32 i2c_acpi_find_bus_speed(struct device *dev); -struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, - struct i2c_board_info *info); +struct i2c_client *i2c_acpi_new_device_by_fwnode(struct fwnode_handle *fwnode, + int index, + struct i2c_board_info *info); struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle); bool i2c_acpi_waive_d0_probe(struct device *dev); #else @@ -1043,8 +1044,9 @@ static inline u32 i2c_acpi_find_bus_speed(struct device *dev) { return 0; } -static inline struct i2c_client *i2c_acpi_new_device(struct device *dev, - int index, struct i2c_board_info *info) +static inline struct i2c_client *i2c_acpi_new_device_by_fwnode( + struct fwnode_handle *fwnode, int index, + struct i2c_board_info *info) { return ERR_PTR(-ENODEV); } @@ -1058,4 +1060,11 @@ static inline bool i2c_acpi_waive_d0_probe(struct device *dev) } #endif /* CONFIG_ACPI */ +static inline struct i2c_client *i2c_acpi_new_device(struct device *dev, + int index, + struct i2c_board_info *info) +{ + return i2c_acpi_new_device_by_fwnode(dev_fwnode(dev), index, info); +} + #endif /* _LINUX_I2C_H */ -- cgit v1.2.3 From 9dfa374cc6d04d2515adc21c39e356b64ee45a29 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 3 Dec 2021 11:28:47 +0100 Subject: platform_data: Add linux/platform_data/tps68470.h file The clk 
and regulator frameworks expect clk/regulator consumer-devices to have info about the consumed clks/regulators described in the device's fw_node. To work around cases where this info is not present in the firmware tables, which is often the case on x86/ACPI devices, both frameworks allow the provider-driver to attach info about consumers to the provider-device during probe/registration of the provider device. The TI TPS68470 PMIC is used on x86/ACPI devices with the consumer-info missing from the ACPI tables. Thus the tps68470-clk and tps68470-regulator drivers must provide the consumer-info at probe time. Define tps68470_clk_platform_data and tps68470_regulator_platform_data structs to allow the x86 platform code to pass the necessary consumer info to these drivers. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20211203102857.44539-5-hdegoede@redhat.com --- include/linux/platform_data/tps68470.h | 35 ++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 include/linux/platform_data/tps68470.h (limited to 'include/linux') diff --git a/include/linux/platform_data/tps68470.h b/include/linux/platform_data/tps68470.h new file mode 100644 index 000000000000..126d082c3f2e --- /dev/null +++ b/include/linux/platform_data/tps68470.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * TI TPS68470 PMIC platform data definition. + * + * Copyright (c) 2021 Red Hat Inc.
+ * + * Red Hat authors: + * Hans de Goede + */ +#ifndef __PDATA_TPS68470_H +#define __PDATA_TPS68470_H + +enum tps68470_regulators { + TPS68470_CORE, + TPS68470_ANA, + TPS68470_VCM, + TPS68470_VIO, + TPS68470_VSIO, + TPS68470_AUX1, + TPS68470_AUX2, + TPS68470_NUM_REGULATORS +}; + +struct regulator_init_data; + +struct tps68470_regulator_platform_data { + const struct regulator_init_data *reg_init_data[TPS68470_NUM_REGULATORS]; +}; + +struct tps68470_clk_platform_data { + const char *consumer_dev_name; + const char *consumer_con_id; +}; + +#endif -- cgit v1.2.3 From 3c118547f87e930d45a5787e386734015dd93b32 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 10 Dec 2021 21:29:59 +0100 Subject: u64_stats: Disable preemption on 32bit UP+SMP PREEMPT_RT during updates. On PREEMPT_RT the seqcount_t for synchronisation is required on 32bit architectures even on UP because the softirq (and the threaded IRQ handler) can be preempted. With the seqcount_t for synchronisation, a reader with higher priority can preempt the writer and then spin endlessly in read_seqcount_begin() while the writer can't make progress. To avoid such a lock up on PREEMPT_RT the writer must disable preemption during the update. There is no need to disable interrupts because no writer is using this API in hard-IRQ context on PREEMPT_RT. Disable preemption on 32bit-RT within the u64_stats write section. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. 
Miller --- include/linux/u64_stats_sync.h | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index e8ec116c916b..6ad4e9032d53 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -66,7 +66,7 @@ #include struct u64_stats_sync { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) seqcount_t seq; #endif }; @@ -125,7 +125,7 @@ static inline void u64_stats_inc(u64_stats_t *p) } #endif -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) #else static inline void u64_stats_init(struct u64_stats_sync *syncp) @@ -135,15 +135,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp) static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); write_seqcount_begin(&syncp->seq); #endif } static inline void u64_stats_update_end(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); #endif } @@ -152,8 +156,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) { unsigned long flags = 0; -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - local_irq_save(flags); +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + else + local_irq_save(flags); write_seqcount_begin(&syncp->seq); #endif return flags; @@ -163,15 
+170,18 @@ static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, unsigned long flags) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); - local_irq_restore(flags); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + else + local_irq_restore(flags); #endif } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_begin(&syncp->seq); #else return 0; @@ -180,7 +190,7 @@ static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync * static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_disable(); #endif return __u64_stats_fetch_begin(syncp); @@ -189,7 +199,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_retry(&syncp->seq, start); #else return false; @@ -199,7 +209,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_enable(); #endif return __u64_stats_fetch_retry(syncp, start); @@ -213,7 +223,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, */ static inline unsigned int 
u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_disable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) local_irq_disable(); #endif return __u64_stats_fetch_begin(syncp); @@ -222,7 +234,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_enable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) local_irq_enable(); #endif return __u64_stats_fetch_retry(syncp, start); -- cgit v1.2.3 From 4bc5e64e6cf37007e436970024e5998ee0935651 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 26 Nov 2021 01:13:32 +0100 Subject: efi: Move efifb_setup_from_dmi() prototype from arch headers Commit 8633ef82f101 ("drivers/firmware: consolidate EFI framebuffer setup for all arches") made the Generic System Framebuffers (sysfb) driver able to be built on non-x86 architectures. But it left the efifb_setup_from_dmi() function prototype declaration in the architecture specific headers. 
This could lead to the following compiler warning as reported by the kernel test robot: drivers/firmware/efi/sysfb_efi.c:70:6: warning: no previous prototype for function 'efifb_setup_from_dmi' [-Wmissing-prototypes] void efifb_setup_from_dmi(struct screen_info *si, const char *opt) ^ drivers/firmware/efi/sysfb_efi.c:70:1: note: declare 'static' if the function is not intended to be used outside of this translation unit void efifb_setup_from_dmi(struct screen_info *si, const char *opt) Fixes: 8633ef82f101 ("drivers/firmware: consolidate EFI framebuffer setup for all arches") Reported-by: kernel test robot Cc: # 5.15.x Signed-off-by: Javier Martinez Canillas Acked-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20211126001333.555514-1-javierm@redhat.com Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index dbd39b20e034..ef8dbc0a1522 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1283,4 +1283,10 @@ static inline struct efi_mokvar_table_entry *efi_mokvar_entry_find( } #endif +#ifdef CONFIG_SYSFB +extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); +#else +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) { } +#endif + #endif /* _LINUX_EFI_H */ -- cgit v1.2.3 From f92c1e183604c20ce00eb889315fdaa8f2d9e509 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 8 Dec 2021 20:32:44 +0100 Subject: bpf: Add get_func_[arg|ret|arg_cnt] helpers Adding following helpers for tracing programs: Get n-th argument of the traced function: long bpf_get_func_arg(void *ctx, u32 n, u64 *value) Get return value of the traced function: long bpf_get_func_ret(void *ctx, u64 *value) Get arguments count of the traced function: long bpf_get_func_arg_cnt(void *ctx) The trampoline now stores number of arguments on ctx-8 address, so it's easy to verify argument index and find return value 
argument's position. Moving function ip address on the trampoline stack behind the number of function arguments, so it's now stored on ctx-16 address if it's needed. All helpers above are inlined by verifier. Also a slightly unrelated small change - using newly added function bpf_prog_has_trampoline in check_get_func_ip. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211208193245.172141-5-jolsa@kernel.org --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7a40022e3d00..965fffaf0308 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -777,6 +777,7 @@ void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); int bpf_jit_charge_modmem(u32 pages); void bpf_jit_uncharge_modmem(u32 pages); +bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) @@ -805,6 +806,10 @@ static inline bool is_bpf_image_address(unsigned long address) { return false; } +static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) +{ + return false; +} #endif struct bpf_func_info_aux { -- cgit v1.2.3 From 0e25498f8cd43c1b5aa327f373dd094e9a006da7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 28 Jun 2021 14:52:01 -0500 Subject: exit: Add and use make_task_dead. There are two big uses of do_exit. The first is its designed use as the guts of the exit(2) system call. The second use is to terminate a task after something catastrophic has happened like a NULL pointer in kernel code. Add a function make_task_dead that is initially exactly the same as do_exit to cover the cases where do_exit is called to handle catastrophic failure. In time this can probably be reduced to just a light wrapper around do_task_dead.
For now keep it exactly the same so that there will be no behavioral differences introducing this new concept. Replace all of the uses of do_exit that use it for catastrophic task cleanup with make_task_dead to make it clear what the code is doing. As part of this rename rewind_stack_do_exit to rewind_stack_and_make_dead. Signed-off-by: "Eric W. Biederman" --- include/linux/sched/task.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ba88a6987400..2d4bbd9c3278 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -59,6 +59,7 @@ extern void sched_post_fork(struct task_struct *p, extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); +void __noreturn make_task_dead(int signr); extern void proc_caches_init(void); -- cgit v1.2.3 From bbda86e988d4c124e4cfa816291cbd583ae8bfb1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Nov 2021 10:27:36 -0600 Subject: exit: Implement kthread_exit The way the per task_struct exit_code is used by kernel threads is not quite compatible with how it is used by userspace applications. The low byte of the userspace exit_code value encodes the exit signal. While kthreads just use the value as an int holding ordinary kernel function exit status like -EPERM. Add kthread_exit to clearly separate the two kinds of uses. Signed-off-by: "Eric W.
Biederman" --- include/linux/kthread.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 346b0f269161..22c43d419687 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -70,6 +70,7 @@ void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); +void kthread_exit(long result) __noreturn; int kthreadd(void *unused); extern struct task_struct *kthreadd_task; -- cgit v1.2.3 From ca3574bd653aba234a4b31955f2778947403be16 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 3 Dec 2021 11:00:19 -0600 Subject: exit: Rename module_put_and_exit to module_put_and_kthread_exit Update module_put_and_exit to call kthread_exit instead of do_exit. Change the name to reflect this change in functionality. All of the users of module_put_and_exit are causing the current kthread to exit so this change makes it clear what is happening. There is no functional change. Signed-off-by: "Eric W. Biederman" --- include/linux/module.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index c9f1200b2312..f03be97e9ec1 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -595,9 +595,9 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, /* Look for this name: can be of form module:name. 
*/ unsigned long module_kallsyms_lookup_name(const char *name); -extern void __noreturn __module_put_and_exit(struct module *mod, +extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); -#define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code) +#define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) #ifdef CONFIG_MODULE_UNLOAD int module_refcount(struct module *mod); @@ -790,7 +790,7 @@ static inline int unregister_module_notifier(struct notifier_block *nb) return 0; } -#define module_put_and_exit(code) do_exit(code) +#define module_put_and_kthread_exit(code) kthread_exit(code) static inline void print_modules(void) { -- cgit v1.2.3 From cead18552660702a4a46f58e65188fe5f36e9dfe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Nov 2021 11:15:19 -0600 Subject: exit: Rename complete_and_exit to kthread_complete_and_exit Update complete_and_exit to call kthread_exit instead of do_exit. Change the name to reflect this change in functionality. All of the users of complete_and_exit are causing the current kthread to exit so this change makes it clear what is happening. Move the implementation of kthread_complete_and_exit from kernel/exit.c to kernel/kthread.c. As this function is kthread specific it makes most sense to live with the kthread functions. There is no functional change. Signed-off-by: "Eric W.
Biederman" --- include/linux/kernel.h | 1 - include/linux/kthread.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 77755ac3e189..055eb203c00e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -187,7 +187,6 @@ static inline void might_fault(void) { } #endif void do_exit(long error_code) __noreturn; -void complete_and_exit(struct completion *, long) __noreturn; extern int num_to_str(char *buf, int size, unsigned long long num, unsigned int width); diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 22c43d419687..d86a7e3b9a52 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -71,6 +71,7 @@ int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); void kthread_exit(long result) __noreturn; +void kthread_complete_and_exit(struct completion *, long) __noreturn; int kthreadd(void *unused); extern struct task_struct *kthreadd_task; -- cgit v1.2.3 From 40966e316f86b8cfd83abd31ccb4df729309d3e7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 2 Dec 2021 09:56:14 -0600 Subject: kthread: Ensure struct kthread is present for all kthreads Today the rules are a bit iffy and arbitrary about which kernel threads have struct kthread present. Both idle threads and threads started with create_kthread want struct kthread present so that is effectively all kernel threads. Make the rule that if PF_KTHREAD and the task is running then struct kthread is present. This will allow the kernel thread code to use tsk->exit_code with different semantics from ordinary processes. To ensure that struct kthread is present for all kernel threads move its allocation into copy_process. Add a deallocation of struct kthread in exec for processes that were kernel threads.
Move the allocation of struct kthread for the initial thread earlier so that it is not repeated for each additional idle thread. Move the initialization of struct kthread into set_kthread_struct so that the structure is always and reliably initialized. Clear set_child_tid in free_kthread_struct to ensure the kthread struct is reliably freed during exec. The function free_kthread_struct does not need to clear vfork_done during exec as exec_mm_release called from exec_mmap has already cleared vfork_done. Signed-off-by: "Eric W. Biederman" --- include/linux/kthread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index d86a7e3b9a52..4f3433afb54b 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -33,7 +33,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), unsigned int cpu, const char *namefmt); -void set_kthread_struct(struct task_struct *p); +bool set_kthread_struct(struct task_struct *p); void kthread_set_per_cpu(struct task_struct *k, int cpu); bool kthread_is_per_cpu(struct task_struct *k); -- cgit v1.2.3 From df5e49c880ea0776806b8a9f8ab95e035272cf6f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: SUNRPC: change svc_get() to return the svc. It is common for 'get' functions to return the object that was 'got', and there are a couple of places where users of svc_get() would be a little simpler if svc_get() did that. Make it so. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 0ae28ae6caf2..5d9568953fcd 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -120,9 +120,10 @@ struct svc_serv { * change the number of threads. Horrible, but there it is.
* Should be called with the "service mutex" held. */ -static inline void svc_get(struct svc_serv *serv) +static inline struct svc_serv *svc_get(struct svc_serv *serv) { serv->sv_nrthreads++; + return serv; } /* -- cgit v1.2.3 From 8c62d12740a1450d2e8456d5747f440e10db281a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: SUNRPC/NFSD: clean up get/put functions. svc_destroy() is poorly named - it doesn't necessarily destroy the svc, it might just reduce the ref count. nfsd_destroy() is poorly named for the same reason. This patch: - removes the refcount functionality from svc_destroy(), moving it to a new svc_put(). Almost all previous callers of svc_destroy() now call svc_put(). - renames nfsd_destroy() to nfsd_put() and improves the code, using the new svc_destroy() rather than svc_put() - removes a few comments that explain the importance of balanced get/put calls. This should be obvious. The only non-trivial part of this is that svc_destroy() would call svc_sock_update() on a non-final decrement. It can no longer do that, and svc_put() isn't really a good place for it. This call is now made from svc_exit_thread() which seems like a good place. This makes the call *before* sv_nrthreads is decremented rather than after. This is not particularly important as the call just sets a flag which causes sv_nrthreads to be checked later. A subsequent patch will improve the ordering. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 5d9568953fcd..73d56d33a36d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -114,8 +114,13 @@ struct svc_serv { #endif /* CONFIG_SUNRPC_BACKCHANNEL */ }; -/* - * We use sv_nrthreads as a reference count.
svc_destroy() drops +/** + * svc_get() - increment reference count on a SUNRPC serv + * @serv: the svc_serv to have count incremented + * + * Returns: the svc_serv that was passed in. + * + * We use sv_nrthreads as a reference count. svc_put() drops * this refcount, so we need to bump it up around operations that * change the number of threads. Horrible, but there it is. * Should be called with the "service mutex" held. @@ -126,6 +131,22 @@ static inline struct svc_serv *svc_get(struct svc_serv *serv) return serv; } +void svc_destroy(struct svc_serv *serv); + +/** + * svc_put - decrement reference count on a SUNRPC serv + * @serv: the svc_serv to have count decremented + * + * When the reference count reaches zero, svc_destroy() + * is called to clean up and free the serv. + */ +static inline void svc_put(struct svc_serv *serv) +{ + serv->sv_nrthreads -= 1; + if (serv->sv_nrthreads == 0) + svc_destroy(serv); +} + /* * Maximum payload size supported by a kernel RPC server. * This is use to determine the max number of pages nfsd is @@ -515,7 +536,6 @@ struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); -void svc_destroy(struct svc_serv *); void svc_shutdown_net(struct svc_serv *, struct net *); int svc_process(struct svc_rqst *); int bc_svc_process(struct svc_serv *, struct rpc_rqst *, -- cgit v1.2.3 From ec52361df99b490f6af412b046df9799b92c1050 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: SUNRPC: stop using ->sv_nrthreads as a refcount The use of sv_nrthreads as a general refcount results in clumsy code, as is seen by various comments needed to explain the situation. This patch introduces a 'struct kref' and uses that for reference counting, leaving sv_nrthreads to be a pure count of threads. 
The kref is managed particularly in svc_get() and svc_put(), and also nfsd_put(); svc_destroy() now takes a pointer to the embedded kref, rather than to the serv. nfsd allows the svc_serv to exist with ->sv_nrthreads being zero. This happens when a transport is created before the first thread is started. To support this, a 'keep_active' flag is introduced which holds a ref on the svc_serv. This is set when any listening socket is successfully added (unless there are running threads), and cleared when the number of threads is set. So when the last thread exits, the nfs_serv will be destroyed. The use of 'keep_active' replaces previous code which checked if there were any permanent sockets. We no longer clear ->rq_server when nfsd() exits. This was done to prevent svc_exit_thread() from calling svc_destroy(). Instead we take an extra reference to the svc_serv to prevent svc_destroy() from being called. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 73d56d33a36d..3903b4ae8ac5 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -85,6 +85,7 @@ struct svc_serv { struct svc_program * sv_program; /* RPC program */ struct svc_stat * sv_stats; /* RPC statistics */ spinlock_t sv_lock; + struct kref sv_refcnt; unsigned int sv_nrthreads; /* # of server threads */ unsigned int sv_maxconn; /* max connections allowed or * '0' causing max to be based @@ -119,19 +120,14 @@ struct svc_serv { * @serv: the svc_serv to have count incremented * * Returns: the svc_serv that was passed in. - * - * We use sv_nrthreads as a reference count. svc_put() drops - * this refcount, so we need to bump it up around operations that - * change the number of threads. Horrible, but there it is. - * Should be called with the "service mutex" held.
*/ static inline struct svc_serv *svc_get(struct svc_serv *serv) { - serv->sv_nrthreads++; + kref_get(&serv->sv_refcnt); return serv; } -void svc_destroy(struct svc_serv *serv); +void svc_destroy(struct kref *); /** * svc_put - decrement reference count on a SUNRPC serv @@ -142,9 +138,7 @@ void svc_destroy(struct svc_serv *serv); */ static inline void svc_put(struct svc_serv *serv) { - serv->sv_nrthreads -= 1; - if (serv->sv_nrthreads == 0) - svc_destroy(serv); + kref_put(&serv->sv_refcnt, svc_destroy); } /* -- cgit v1.2.3 From 3409e4f1e8f239f0ed81be0b068ecf4e73e2e826 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: NFSD: Make it possible to use svc_set_num_threads_sync nfsd cannot currently use svc_set_num_threads_sync. It instead uses svc_set_num_threads which does *not* wait for threads to all exit, and has a separate mechanism (nfsd_shutdown_complete) to wait for completion. The reason that nfsd is unlike other services is that nfsd threads can exit separately from svc_set_num_threads being called - they die on receipt of SIGKILL. Also, when the last thread exits, the service must be shut down (sockets closed). For this, the nfsd_mutex needs to be taken, and as that mutex needs to be held while svc_set_num_threads is called, the one cannot wait for the other. This patch changes the nfsd thread so that it can drop the ref on the service without blocking on nfsd_mutex, so that svc_set_num_threads_sync can be used: - if it can drop a non-last reference, it does that. This does not trigger shutdown and does not require a mutex. This will likely happen for all but the last thread signalled, and for all threads being shut down by nfsd_shutdown_threads() - if it can get the mutex without blocking (trylock), it does that and then drops the reference. 
This will likely happen for the last thread killed by SIGKILL - Otherwise there might be an unrelated task holding the mutex, possibly in another network namespace, or nfsd_shutdown_threads() might be just about to get a reference on the service, after which we can drop ours safely. We cannot conveniently get wakeup notifications on these events, and we are unlikely to need to, so we sleep briefly and check again. With this we can discard nfsd_shutdown_complete and nfsd_complete_shutdown(), and switch to svc_set_num_threads_sync. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3903b4ae8ac5..36bfc0281988 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -141,6 +141,19 @@ static inline void svc_put(struct svc_serv *serv) kref_put(&serv->sv_refcnt, svc_destroy); } +/** + * svc_put_not_last - decrement non-final reference count on SUNRPC serv + * @serv: the svc_serv to have count decremented + * + * Returns: %true is refcount was decremented. + * + * If the refcount is 1, it is not decremented and instead failure is reported. + */ +static inline bool svc_put_not_last(struct svc_serv *serv) +{ + return refcount_dec_not_one(&serv->sv_refcnt.refcount); +} + /* * Maximum payload size supported by a kernel RPC server. * This is use to determine the max number of pages nfsd is -- cgit v1.2.3 From 3ebdbe5203a874614819700d3f470724cb803709 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: SUNRPC: discard svo_setup and rename svc_set_num_threads_sync() The ->svo_setup callback serves no purpose. It is always called from within the same module that chooses which callback is needed. So discard it and call the relevant function directly. 
Now that svc_set_num_threads() is no longer used remove it and rename svc_set_num_threads_sync() to remove the "_sync" suffix. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 36bfc0281988..0b38c6eaf985 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -64,9 +64,6 @@ struct svc_serv_ops { /* queue up a transport for servicing */ void (*svo_enqueue_xprt)(struct svc_xprt *); - /* set up thread (or whatever) execution context */ - int (*svo_setup)(struct svc_serv *, struct svc_pool *, int); - /* optional module to count when adding threads (pooled svcs only) */ struct module *svo_module; }; @@ -541,7 +538,6 @@ void svc_pool_map_put(void); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, const struct svc_serv_ops *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); -int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); void svc_shutdown_net(struct svc_serv *, struct net *); int svc_process(struct svc_rqst *); -- cgit v1.2.3 From cf0e124e0a489944d08fcc3c694d2b234d2cc658 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: SUNRPC: move the pool_map definitions (back) into svc.c These definitions are not used outside of svc.c, and there is no evidence that they ever have been. So move them into svc.c and make the declarations 'static'. 
Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 0b38c6eaf985..d69e6108cb83 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -494,29 +494,6 @@ struct svc_procedure { const char * pc_name; /* for display */ }; -/* - * Mode for mapping cpus to pools. - */ -enum { - SVC_POOL_AUTO = -1, /* choose one of the others */ - SVC_POOL_GLOBAL, /* no mapping, just a single global pool - * (legacy & UP mode) */ - SVC_POOL_PERCPU, /* one pool per cpu */ - SVC_POOL_PERNODE /* one pool per numa node */ -}; - -struct svc_pool_map { - int count; /* How many svc_servs use us */ - int mode; /* Note: int not enum to avoid - * warnings about "enumeration value - * not handled in switch" */ - unsigned int npools; - unsigned int *pool_to; /* maps pool id to cpu or node */ - unsigned int *to_pool; /* maps cpu or node to pool id */ -}; - -extern struct svc_pool_map svc_pool_map; - /* * Function prototypes. */ @@ -533,8 +510,6 @@ void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page); void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); -unsigned int svc_pool_map_get(void); -void svc_pool_map_put(void); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, const struct svc_serv_ops *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); -- cgit v1.2.3 From 6b044fbaab02292fedb17565dbb3f2528083b169 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: lockd: use svc_set_num_threads() for thread start and stop svc_set_num_threads() does everything that lockd_start_svc() does, except set sv_maxconn. It also (when passed 0) finds the threads and stops them with kthread_stop(). 
So move the setting for sv_maxconn, and use svc_set_num_threads(). We now don't need nlmsvc_task. Now that we use svc_set_num_threads() it makes sense to set svo_module. This requests that the thread exits with module_put_and_exit(). Also fix the documentation for svo_module to make this explicit. svc_prepare_thread is now only used where it is defined, so it can be made static. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index d69e6108cb83..cf175d47c6b7 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -64,7 +64,9 @@ struct svc_serv_ops { /* queue up a transport for servicing */ void (*svo_enqueue_xprt)(struct svc_xprt *); - /* optional module to count when adding threads (pooled svcs only) */ + /* optional module to count when adding threads. + * Thread function must call module_put_and_exit() to exit. + */ struct module *svo_module; }; @@ -504,8 +506,6 @@ struct svc_serv *svc_create(struct svc_program *, unsigned int, const struct svc_serv_ops *); struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node); -struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, - struct svc_pool *pool, int node); void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page); void svc_rqst_free(struct svc_rqst *); -- cgit v1.2.3 From c8064e5b4adac5e1255cf4f3b374e75b5376e7ca Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Nov 2021 11:08:07 +0100 Subject: bpf: Let bpf_warn_invalid_xdp_action() report more info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In non-trivial scenarios, the action id alone is not sufficient to identify the program causing the warning. Before the previous patch, the generated stack-trace pointed out at least the involved device driver.
Let's additionally include the program name and id, and the relevant device name. If the user needs additional information, they can fetch it via a kernel probe, leveraging the arguments added here. Signed-off-by: Paolo Abeni Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/ddb96bb975cbfddb1546cf5da60e77d5100b533c.1638189075.git.pabeni@redhat.com --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 68c6b5c208e7..60eec80fa1d4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1027,7 +1027,7 @@ void xdp_do_flush(void); */ #define xdp_do_flush_map xdp_do_flush -void bpf_warn_invalid_xdp_action(u32 act); +void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, -- cgit v1.2.3 From 22c3f2f56bd9c901e1da69040680c311f310635d Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 1 Dec 2021 11:36:18 -0800 Subject: net/mlx5: Separate FDB namespace This patch doesn't add any additional namespaces, but just separates the naming to be used by each FDB user, bypass and kernel. Downstream patches will actually split this up and allow more than a single priority for the bypass users.
Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed Acked-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index cd2d4c572367..b1aad14689e3 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -73,6 +73,7 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_KERNEL, MLX5_FLOW_NAMESPACE_LEFTOVERS, MLX5_FLOW_NAMESPACE_ANCHOR, + MLX5_FLOW_NAMESPACE_FDB_BYPASS, MLX5_FLOW_NAMESPACE_FDB, MLX5_FLOW_NAMESPACE_ESW_EGRESS, MLX5_FLOW_NAMESPACE_ESW_INGRESS, -- cgit v1.2.3 From 8aa45b544db9788b0fcc7a300eba8a3ade5d3d50 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Fri, 10 Dec 2021 13:11:34 +0200 Subject: HID: Add map_msc() to avoid boilerplate code Since we are going to have more MSC events too, add map_msc() that can be used to fill in necessary fields and avoid boilerplate code. Signed-off-by: Mika Westerberg Reviewed-by: Benjamin Tissoires Signed-off-by: Tero Kristo Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20211210111138.1248187-2-tero.kristo@linux.intel.com --- include/linux/hid.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index b2fea7fc54a1..f18e2cf5d74d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1010,6 +1010,10 @@ static inline void hid_map_usage(struct hid_input *hidinput, bmap = input->ledbit; limit = LED_MAX; break; + case EV_MSC: + bmap = input->mscbit; + limit = MSC_MAX; + break; } if (unlikely(c > limit || !bmap)) { -- cgit v1.2.3 From ae7fafa6896ae762ff489a5e71c8e5fabfeb61ee Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Fri, 10 Dec 2021 13:11:36 +0200 Subject: HID: Add hid usages for USI style pens Add usage codes for USI style pens, based on the USB-HID usage table: https://usb.org/document-library/hid-usage-tables-122 See 
chapter 16, Digitizers Page (0x0D) Signed-off-by: Tero Kristo Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20211210111138.1248187-4-tero.kristo@linux.intel.com --- include/linux/hid.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index f18e2cf5d74d..fa2ddda84a53 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -241,6 +241,7 @@ struct hid_item { #define HID_DG_TOUCH 0x000d0033 #define HID_DG_UNTOUCH 0x000d0034 #define HID_DG_TAP 0x000d0035 +#define HID_DG_TRANSDUCER_INDEX 0x000d0038 #define HID_DG_TABLETFUNCTIONKEY 0x000d0039 #define HID_DG_PROGRAMCHANGEKEY 0x000d003a #define HID_DG_BATTERYSTRENGTH 0x000d003b @@ -253,6 +254,15 @@ struct hid_item { #define HID_DG_BARRELSWITCH 0x000d0044 #define HID_DG_ERASER 0x000d0045 #define HID_DG_TABLETPICK 0x000d0046 +#define HID_DG_PEN_COLOR 0x000d005c +#define HID_DG_PEN_LINE_WIDTH 0x000d005e +#define HID_DG_PEN_LINE_STYLE 0x000d0070 +#define HID_DG_PEN_LINE_STYLE_INK 0x000d0072 +#define HID_DG_PEN_LINE_STYLE_PENCIL 0x000d0073 +#define HID_DG_PEN_LINE_STYLE_HIGHLIGHTER 0x000d0074 +#define HID_DG_PEN_LINE_STYLE_CHISEL_MARKER 0x000d0075 +#define HID_DG_PEN_LINE_STYLE_BRUSH 0x000d0076 +#define HID_DG_PEN_LINE_STYLE_NO_PREFERENCE 0x000d0077 #define HID_CP_CONSUMERCONTROL 0x000c0001 #define HID_CP_NUMERICKEYPAD 0x000c0002 -- cgit v1.2.3 From 5904a3f9d756f108279f8c5b8f17ca6ddfb28d24 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Fri, 10 Dec 2021 13:11:37 +0200 Subject: HID: input: Make hidinput_find_field() static This function is not called outside of hid-input.c so we can make it static. 
Signed-off-by: Mika Westerberg Reviewed-by: Benjamin Tissoires Signed-off-by: Tero Kristo Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20211210111138.1248187-5-tero.kristo@linux.intel.com --- include/linux/hid.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index fa2ddda84a53..ff8b6973022a 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -899,7 +899,6 @@ extern void hidinput_disconnect(struct hid_device *); int hid_set_field(struct hid_field *, unsigned, __s32); int hid_input_report(struct hid_device *, int type, u8 *, u32, int); -int hidinput_find_field(struct hid_device *hid, unsigned int type, unsigned int code, struct hid_field **field); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); -- cgit v1.2.3 From fd8d135b2c5e88662f2729e034913f183455a667 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Wed, 8 Dec 2021 22:40:43 +1000 Subject: HID: quirks: Allow inverting the absolute X/Y values Add a HID_QUIRK_X_INVERT/HID_QUIRK_Y_INVERT quirk that can be used to invert the X/Y values. 
Signed-off-by: Alistair Francis [bentiss: silence checkpatch warning] Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20211208124045.61815-2-alistair@alistair23.me --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index ff8b6973022a..a14c089eb0c1 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -359,6 +359,8 @@ struct hid_item { /* BIT(9) reserved for backward compatibility, was NO_INIT_INPUT_REPORTS */ #define HID_QUIRK_ALWAYS_POLL BIT(10) #define HID_QUIRK_INPUT_PER_APP BIT(11) +#define HID_QUIRK_X_INVERT BIT(12) +#define HID_QUIRK_Y_INVERT BIT(13) #define HID_QUIRK_SKIP_OUTPUT_REPORTS BIT(16) #define HID_QUIRK_SKIP_OUTPUT_REPORT_ID BIT(17) #define HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP BIT(18) -- cgit v1.2.3 From 369461ce8fb6c8156206c7110d7da48e9fbc41bb Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 8 Dec 2021 14:11:20 -0600 Subject: x86: perf: Move RDPMC event flag to a common definition In preparation to enable user counter access on arm64 and to move some of the user access handling to perf core, create a common event flag for user counter access and convert x86 to use it. Since the architecture-specific flags start at the LSB, start at the MSB for common flags. Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Kan Liang Cc: Thomas Gleixner Cc: Borislav Petkov Cc: x86@kernel.org Cc: "H.
Peter Anvin" Cc: linux-perf-users@vger.kernel.org Reviewed-by: Mark Rutland Reviewed-by: Thomas Gleixner Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20211208201124.310740-2-robh@kernel.org Signed-off-by: Will Deacon --- include/linux/perf_event.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0dcfd265beed..ba9467972c09 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,6 +129,15 @@ struct hw_perf_event_extra { int idx; /* index in shared_regs->regs[] */ }; +/** + * hw_perf_event::flag values + * + * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific + * usage. + */ +#define PERF_EVENT_FLAG_ARCH 0x0000ffff +#define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 + /** * struct hw_perf_event - performance event hardware details: */ -- cgit v1.2.3 From 82ff0c022d19c2ad69a472692bb7ee01ac07a40b Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 8 Dec 2021 14:11:21 -0600 Subject: perf: Add a counter for number of user access events in context On arm64, user space counter access will be controlled differently compared to x86. On x86, access in the strictest mode is enabled for all tasks in an MM when any event is mmap'ed. For arm64, access is explicitly requested for an event and only enabled when the event's context is active. This avoids hooks into the arch context switch code and gives better control of when access is enabled. In order to configure user space access when the PMU is enabled, it is necessary to know if any event (currently active or not) in the current context has user space access enabled. Add a counter similar to other counters in the context to avoid walking the event list every time.
Reviewed-by: Mark Rutland Reviewed-by: Thomas Gleixner Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20211208201124.310740-3-robh@kernel.org Signed-off-by: Will Deacon --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ba9467972c09..411e34210fbf 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -831,6 +831,7 @@ struct perf_event_context { int nr_events; int nr_active; + int nr_user; int is_active; int nr_stat; int nr_freq; -- cgit v1.2.3 From 8404b0fbc7fbd42e5c5d28cdedd450e70829c77a Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Thu, 2 Dec 2021 16:06:33 +0800 Subject: drivers/perf: hisi: Add driver for HiSilicon PCIe PMU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported to sample bandwidth, latency, buffer occupation etc. Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is registered as a PMU in /sys/bus/event_source/devices, so users can select target PMU, and use filter to do further sets. Filtering options contains: event - select the event. port - select target Root Ports. Information of Root Ports are shown under sysfs. bdf - select requester_id of target EP device. trig_len - set trigger condition for starting event statistics. trig_mode - set trigger mode. 0 means starting to statistic when bigger than trigger condition, and 1 means smaller. thr_len - set threshold for statistics. thr_mode - set threshold mode. 0 means count when bigger than threshold, and 1 means smaller. 
Acked-by: Krzysztof Wilczyński Reviewed-by: John Garry Signed-off-by: Qi Liu Reviewed-by: Shaokun Zhang Link: https://lore.kernel.org/r/20211202080633.2919-3-liuqi115@huawei.com Signed-off-by: Will Deacon --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 773c83730906..411a428ace4d 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -225,6 +225,7 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, CPUHP_AP_PERF_ARM_HISI_PA_ONLINE, CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE, + CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, CPUHP_AP_PERF_ARM_L2X0_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE, -- cgit v1.2.3 From c8a2a011cd04054a6577d8cf774adf9e09302876 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 14 Dec 2021 03:45:35 +0200 Subject: net: dsa: sja1105: fix broken connection with the sja1110 tagger The driver was incorrectly converted assuming that "sja1105" is the only tagger supported by this driver. This results in SJA1110 switches failing to probe: sja1105 spi1.0: Unable to connect to tag protocol "sja1110": -EPROTONOSUPPORT sja1105: probe of spi1.2 failed with error -93 Add DSA_TAG_PROTO_SJA1110 to the list of supported taggers by the sja1105 driver. The sja1105_tagger_data structure format is common for the two tagging protocols. Fixes: c79e84866d2a ("net: dsa: tag_sja1105: convert to tagger-owned data") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/dsa/sja1105.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index e9cb1ae6d742..159e43171ccc 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -70,7 +70,8 @@ struct sja1105_skb_cb { static inline struct sja1105_tagger_data * sja1105_tagger_data(struct dsa_switch *ds) { - BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1105); + BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1105 && + ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1110); return ds->tagger_data; } -- cgit v1.2.3 From 9280ac2e6f199cddcd746a9ba459136b8666287b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Dec 2021 07:15:15 -0800 Subject: net: dev_replace_track() cleanup Use existing helpers (netdev_tracker_free() and netdev_tracker_alloc()) to remove ifdefery. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20211214151515.312535-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 235d5d082f1a..c06e9dc1a317 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3885,16 +3885,14 @@ static inline void dev_replace_track(struct net_device *odev, netdevice_tracker *tracker, gfp_t gfp) { -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER if (odev) - ref_tracker_free(&odev->refcnt_tracker, tracker); -#endif + netdev_tracker_free(odev, tracker); + dev_hold(ndev); dev_put(odev); -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (ndev) - ref_tracker_alloc(&ndev->refcnt_tracker, tracker, gfp); -#endif + netdev_tracker_alloc(ndev, tracker, gfp); } /* Carrier loss detection, dial on demand. 
The functions netif_carrier_on -- cgit v1.2.3 From ad69cd9972e79aba103ba5365de0acd35770c265 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 29 Nov 2021 22:15:27 +0200 Subject: fsnotify: clarify object type argument In preparation for separating object type from iterator type, rename some 'type' arguments in functions to 'obj_type' and remove the unused interface to clear marks by object type mask. Link: https://lore.kernel.org/r/20211129201537.1932819-2-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 51ef2b079bfa..b9c84b1dbcc8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -338,6 +338,7 @@ static inline struct fs_error_report *fsnotify_data_error_report( } enum fsnotify_obj_type { + FSNOTIFY_OBJ_TYPE_ANY = -1, FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_PARENT, FSNOTIFY_OBJ_TYPE_VFSMOUNT, @@ -346,15 +347,9 @@ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; -#define FSNOTIFY_OBJ_TYPE_INODE_FL (1U << FSNOTIFY_OBJ_TYPE_INODE) -#define FSNOTIFY_OBJ_TYPE_PARENT_FL (1U << FSNOTIFY_OBJ_TYPE_PARENT) -#define FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL (1U << FSNOTIFY_OBJ_TYPE_VFSMOUNT) -#define FSNOTIFY_OBJ_TYPE_SB_FL (1U << FSNOTIFY_OBJ_TYPE_SB) -#define FSNOTIFY_OBJ_ALL_TYPES_MASK ((1U << FSNOTIFY_OBJ_TYPE_COUNT) - 1) - -static inline bool fsnotify_valid_obj_type(unsigned int type) +static inline bool fsnotify_valid_obj_type(unsigned int obj_type) { - return (type < FSNOTIFY_OBJ_TYPE_COUNT); + return (obj_type < FSNOTIFY_OBJ_TYPE_COUNT); } struct fsnotify_iter_info { @@ -387,7 +382,7 @@ static inline void fsnotify_iter_set_report_type_mark( static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ struct fsnotify_iter_info *iter_info) \ { 
\ - return (iter_info->report_mask & FSNOTIFY_OBJ_TYPE_##NAME##_FL) ? \ + return (iter_info->report_mask & (1U << FSNOTIFY_OBJ_TYPE_##NAME)) ? \ iter_info->marks[FSNOTIFY_OBJ_TYPE_##NAME] : NULL; \ } @@ -604,11 +599,11 @@ extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn, __kernel_fsid_t *fsid); /* attach the mark to the object */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, - fsnotify_connp_t *connp, unsigned int type, + fsnotify_connp_t *connp, unsigned int obj_type, int allow_dups, __kernel_fsid_t *fsid); extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int type, int allow_dups, + unsigned int obj_type, int allow_dups, __kernel_fsid_t *fsid); /* attach the mark to the inode */ @@ -637,22 +632,23 @@ extern void fsnotify_detach_mark(struct fsnotify_mark *mark); extern void fsnotify_free_mark(struct fsnotify_mark *mark); /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); -/* run all the marks in a group, and clear all of the marks attached to given object type */ -extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type); +/* Clear all of the marks of a group attached to a given object type */ +extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, + unsigned int obj_type); /* run all the marks in a group, and clear all of the vfsmount marks */ static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { - fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL); + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); } /* run all the marks in a group, and clear all of the inode marks */ static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) { - fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE_FL); + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE); } /* run all 
the marks in a group, and clear all of the sn marks */ static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group) { - fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB_FL); + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB); } extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); -- cgit v1.2.3 From 1c9007d62bea6fd164285314f7553f73e5308863 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 29 Nov 2021 22:15:28 +0200 Subject: fsnotify: separate mark iterator type from object type enum They are two different types that use the same enum, so this is confusing. Use the object type to indicate the type of object a mark is attached to and the iter type to indicate the type of watch. A group can have two different watches of the same object type (parent and child watches) that match the same event. Link: https://lore.kernel.org/r/20211129201537.1932819-3-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 41 +++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b9c84b1dbcc8..73739fee1710 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -337,10 +337,25 @@ static inline struct fs_error_report *fsnotify_data_error_report( } } +/* + * Index to merged marks iterator array that correlates to a type of watch. + * The type of watched object can be deduced from the iterator type, but not + * the other way around, because an event can match different watched objects + * of the same object type. + * For example, both parent and child are watching an object of type inode.
+ */ +enum fsnotify_iter_type { + FSNOTIFY_ITER_TYPE_INODE, + FSNOTIFY_ITER_TYPE_VFSMOUNT, + FSNOTIFY_ITER_TYPE_SB, + FSNOTIFY_ITER_TYPE_PARENT, + FSNOTIFY_ITER_TYPE_COUNT +}; + +/* The type of object that a mark is attached to */ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_ANY = -1, FSNOTIFY_OBJ_TYPE_INODE, - FSNOTIFY_OBJ_TYPE_PARENT, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, FSNOTIFY_OBJ_TYPE_COUNT, @@ -353,37 +368,37 @@ static inline bool fsnotify_valid_obj_type(unsigned int obj_type) } struct fsnotify_iter_info { - struct fsnotify_mark *marks[FSNOTIFY_OBJ_TYPE_COUNT]; + struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT]; unsigned int report_mask; int srcu_idx; }; static inline bool fsnotify_iter_should_report_type( - struct fsnotify_iter_info *iter_info, int type) + struct fsnotify_iter_info *iter_info, int iter_type) { - return (iter_info->report_mask & (1U << type)); + return (iter_info->report_mask & (1U << iter_type)); } static inline void fsnotify_iter_set_report_type( - struct fsnotify_iter_info *iter_info, int type) + struct fsnotify_iter_info *iter_info, int iter_type) { - iter_info->report_mask |= (1U << type); + iter_info->report_mask |= (1U << iter_type); } static inline void fsnotify_iter_set_report_type_mark( - struct fsnotify_iter_info *iter_info, int type, + struct fsnotify_iter_info *iter_info, int iter_type, struct fsnotify_mark *mark) { - iter_info->marks[type] = mark; - iter_info->report_mask |= (1U << type); + iter_info->marks[iter_type] = mark; + iter_info->report_mask |= (1U << iter_type); } #define FSNOTIFY_ITER_FUNCS(name, NAME) \ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ struct fsnotify_iter_info *iter_info) \ { \ - return (iter_info->report_mask & (1U << FSNOTIFY_OBJ_TYPE_##NAME)) ? \ - iter_info->marks[FSNOTIFY_OBJ_TYPE_##NAME] : NULL; \ + return (iter_info->report_mask & (1U << FSNOTIFY_ITER_TYPE_##NAME)) ? 
\ + iter_info->marks[FSNOTIFY_ITER_TYPE_##NAME] : NULL; \ } FSNOTIFY_ITER_FUNCS(inode, INODE) @@ -391,8 +406,8 @@ FSNOTIFY_ITER_FUNCS(parent, PARENT) FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) FSNOTIFY_ITER_FUNCS(sb, SB) -#define fsnotify_foreach_obj_type(type) \ - for (type = 0; type < FSNOTIFY_OBJ_TYPE_COUNT; type++) +#define fsnotify_foreach_iter_type(type) \ + for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++) /* * fsnotify_connp_t is what we embed in objects which connector can be attached -- cgit v1.2.3 From d61fd650e9d206a71fda789f02a1ced4b19944c4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 29 Nov 2021 22:15:29 +0200 Subject: fanotify: introduce group flag FAN_REPORT_TARGET_FID FAN_REPORT_FID is ambiguous in that it reports the fid of the child for some events and the fid of the parent for create/delete/move events. The new FAN_REPORT_TARGET_FID flag is an implicit request to report the fid of the target object of the operation (a.k.a the child inode) also in create/delete/move events in addition to the fid of the parent and the name of the child. To reduce the test matrix for uninteresting use cases, the new FAN_REPORT_TARGET_FID flag requires both FAN_REPORT_NAME and FAN_REPORT_FID. The convenience macro FAN_REPORT_DFID_NAME_TARGET combines FAN_REPORT_TARGET_FID with all the required flags. 
Link: https://lore.kernel.org/r/20211129201537.1932819-4-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fanotify.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 616af2ea20f3..376e050e6f38 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -25,7 +25,7 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ #define FANOTIFY_CLASS_BITS (FAN_CLASS_NOTIF | FANOTIFY_PERM_CLASSES) -#define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DFID_NAME) +#define FANOTIFY_FID_BITS (FAN_REPORT_DFID_NAME_TARGET) #define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD) -- cgit v1.2.3 From e54183fa7047c15819bc155f4c58501d9a9a3489 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 29 Nov 2021 22:15:30 +0200 Subject: fsnotify: generate FS_RENAME event with rich information The dnotify FS_DN_RENAME event is used to request notification about a move within the same parent directory and was always coupled with the FS_MOVED_FROM event. Rename the FS_DN_RENAME event flag to FS_RENAME, decouple it from FS_MOVED_FROM and report it with the moved dentry instead of the moved inode, so it has the information about both old and new parent and name. Generate the FS_RENAME event regardless of same parent dir and apply the "same parent" rule in the generic fsnotify_handle_event() helper that is used to call backends with ->handle_inode_event() method (i.e. dnotify). The ->handle_inode_event() method is not rich enough to report both old and new parent and name anyway. The enriched event is reported to fanotify over the ->handle_event() method with the old and new dir inode marks in marks array slots for ITER_TYPE_INODE and a new iter type slot ITER_TYPE_INODE2. The enriched event will be used for reporting old and new parent+name to fanotify groups with FAN_RENAME events. 
Link: https://lore.kernel.org/r/20211129201537.1932819-5-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/dnotify.h | 2 +- include/linux/fsnotify.h | 9 ++++++--- include/linux/fsnotify_backend.h | 7 ++++--- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h index 0aad774beaec..b87c3b85a166 100644 --- a/include/linux/dnotify.h +++ b/include/linux/dnotify.h @@ -26,7 +26,7 @@ struct dnotify_struct { FS_MODIFY | FS_MODIFY_CHILD |\ FS_ACCESS | FS_ACCESS_CHILD |\ FS_ATTRIB | FS_ATTRIB_CHILD |\ - FS_CREATE | FS_DN_RENAME |\ + FS_CREATE | FS_RENAME |\ FS_MOVED_FROM | FS_MOVED_TO) extern int dir_notify_enable; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 787545e87eeb..3a2d7dc3c607 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -144,16 +144,19 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, u32 fs_cookie = fsnotify_get_cookie(); __u32 old_dir_mask = FS_MOVED_FROM; __u32 new_dir_mask = FS_MOVED_TO; + __u32 rename_mask = FS_RENAME; const struct qstr *new_name = &moved->d_name; - if (old_dir == new_dir) - old_dir_mask |= FS_DN_RENAME; - if (isdir) { old_dir_mask |= FS_ISDIR; new_dir_mask |= FS_ISDIR; + rename_mask |= FS_ISDIR; } + /* Event with information about both old and new parent+name */ + fsnotify_name(rename_mask, moved, FSNOTIFY_EVENT_DENTRY, + old_dir, old_name, 0); + fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_dir, old_name, fs_cookie); fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE, diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 73739fee1710..790c31844db5 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -63,7 +63,7 @@ */ #define FS_EVENT_ON_CHILD 0x08000000 -#define FS_DN_RENAME 0x10000000 /* file renamed */ +#define FS_RENAME 0x10000000 /* File 
was renamed */ #define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ #define FS_ISDIR 0x40000000 /* event occurred against dir */ #define FS_IN_ONESHOT 0x80000000 /* only send event once */ @@ -76,7 +76,7 @@ * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event * when a directory entry inside a child subdir changes. */ -#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE) +#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ FS_OPEN_EXEC_PERM) @@ -101,7 +101,7 @@ /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ - FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \ + FS_DELETE_SELF | FS_MOVE_SELF | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ FS_ERROR) @@ -349,6 +349,7 @@ enum fsnotify_iter_type { FSNOTIFY_ITER_TYPE_VFSMOUNT, FSNOTIFY_ITER_TYPE_SB, FSNOTIFY_ITER_TYPE_PARENT, + FSNOTIFY_ITER_TYPE_INODE2, FSNOTIFY_ITER_TYPE_COUNT }; -- cgit v1.2.3 From 8cc3b1ccd930fe6971e1527f0c4f1bdc8cb56026 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 29 Nov 2021 22:15:37 +0200 Subject: fanotify: wire up FAN_RENAME event FAN_RENAME is the successor of FAN_MOVED_FROM and FAN_MOVED_TO and can be used to get the old and new parent+name information in a single event. FAN_MOVED_FROM and FAN_MOVED_TO are still supported for backward compatibility, but it makes little sense to use them together with FAN_RENAME in the same group. FAN_RENAME uses special info type records to report the old and new parent+name, so reporting only old and new parent id is less useful and was not implemented. Therefore, FAN_RENAME requires a group with flag FAN_REPORT_NAME.
Link: https://lore.kernel.org/r/20211129201537.1932819-12-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fanotify.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 376e050e6f38..3afdf339d53c 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -82,7 +82,8 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ * Directory entry modification events - reported only to directory * where entry is modified and not to a watching parent. */ -#define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE) +#define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE | \ + FAN_RENAME) /* Events that can be reported with event->fd */ #define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS) -- cgit v1.2.3 From f1d9268e061863ead77b07f5a6807d063e28a1c2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Dec 2021 07:09:33 -0800 Subject: net: add net device refcount tracker to struct packet_type Most notable changes are in af_packet, tipc ones are trivial. Signed-off-by: Eric Dumazet Cc: Jon Maloy Cc: Ying Xue Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c06e9dc1a317..a419718612c6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2533,6 +2533,7 @@ struct packet_type { __be16 type; /* This is really htons(ether_type). 
*/ bool ignore_outgoing; struct net_device *dev; /* NULL is wildcarded here */ + netdevice_tracker dev_tracker; int (*func) (struct sk_buff *, struct net_device *, struct packet_type *, -- cgit v1.2.3 From 685b1afd7911676691c4167f420e16a957f5a38e Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 9 Dec 2021 12:09:23 +0200 Subject: net/mlx5: Introduce log_max_current_uc_list_wr_supported bit Downstream patch will use this bit in order to know whether the device supports changing of max_uc_list. Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3636df90899a..d3899fc33fd7 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1621,7 +1621,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 ext_stride_num_range[0x1]; u8 roce_rw_supported[0x1]; - u8 reserved_at_3a2[0x1]; + u8 log_max_current_uc_list_wr_supported[0x1]; u8 log_max_stride_sz_rq[0x5]; u8 reserved_at_3a8[0x3]; u8 log_min_stride_sz_rq[0x5]; -- cgit v1.2.3 From ff5f87cb6a75dbf6d30668d2464e46249dd5c47f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 3 Dec 2021 11:28:49 +0100 Subject: clk: Introduce clk-tps68470 driver The TPS68470 PMIC provides Clocks, GPIOs and Regulators. At present in the kernel the Regulators and Clocks are controlled by an OpRegion driver designed to work with power control methods defined in ACPI, but some platforms lack those methods, meaning drivers need to be able to consume the resources of these chips through the usual frameworks. This commit adds a driver for the clocks provided by the tps68470, and is designed to bind to the platform_device registered by the intel_skl_int3472 module. 
This is based on this out of tree driver written by Intel: https://github.com/intel/linux-intel-lts/blob/4.14/base/drivers/clk/clk-tps68470.c with various cleanups added. Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20211203102857.44539-7-hdegoede@redhat.com Signed-off-by: Stephen Boyd --- include/linux/mfd/tps68470.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/tps68470.h b/include/linux/mfd/tps68470.h index ffe81127d91c..7807fa329db0 100644 --- a/include/linux/mfd/tps68470.h +++ b/include/linux/mfd/tps68470.h @@ -75,6 +75,17 @@ #define TPS68470_CLKCFG1_MODE_A_MASK GENMASK(1, 0) #define TPS68470_CLKCFG1_MODE_B_MASK GENMASK(3, 2) +#define TPS68470_CLKCFG2_DRV_STR_2MA 0x05 +#define TPS68470_PLL_OUTPUT_ENABLE 0x02 +#define TPS68470_CLK_SRC_XTAL BIT(0) +#define TPS68470_PLLSWR_DEFAULT GENMASK(1, 0) +#define TPS68470_OSC_EXT_CAP_DEFAULT 0x05 + +#define TPS68470_OUTPUT_A_SHIFT 0x00 +#define TPS68470_OUTPUT_B_SHIFT 0x02 +#define TPS68470_CLK_SRC_SHIFT GENMASK(2, 0) +#define TPS68470_OSC_EXT_CAP_SHIFT BIT(2) + #define TPS68470_GPIO_CTL_REG_A(x) (TPS68470_REG_GPCTL0A + (x) * 2) #define TPS68470_GPIO_CTL_REG_B(x) (TPS68470_REG_GPCTL0B + (x) * 2) #define TPS68470_GPIO_MODE_MASK GENMASK(1, 0) -- cgit v1.2.3 From dfd0743f1d9ea76931510ed150334d571fbab49d Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Thu, 9 Dec 2021 15:59:37 +0100 Subject: tee: handle lookup of shm with reference count 0 Since the tee subsystem does not keep a strong reference to its idle shared memory buffers, it races with other threads that try to destroy a shared memory through a close of its dma-buf fd or by unmapping the memory. In tee_shm_get_from_id() when a lookup in teedev->idr has been successful, it is possible that the tee_shm is in the dma-buf teardown path, but that path is blocked by the teedev mutex. 
Since we don't have an API to tell if the tee_shm is in the dma-buf teardown path or not we must find another way of detecting this condition. Fix this by doing the reference counting directly on the tee_shm using a new refcount_t refcount field. dma-buf is replaced by using anon_inode_getfd() instead, this separates the life-cycle of the underlying file from the tee_shm. tee_shm_put() is updated to hold the mutex when decreasing the refcount to 0 and then remove the tee_shm from teedev->idr before releasing the mutex. This means that the tee_shm can never be found unless it has a refcount larger than 0. Fixes: 967c9cca2cc5 ("tee: generic TEE subsystem") Cc: stable@vger.kernel.org Reviewed-by: Greg Kroah-Hartman Reviewed-by: Lars Persson Reviewed-by: Sumit Garg Reported-by: Patrik Lantz Signed-off-by: Jens Wiklander --- include/linux/tee_drv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index a1f03461369b..cf5999626e28 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -195,7 +195,7 @@ int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method, * @offset: offset of buffer in user space * @pages: locked pages from userspace * @num_pages: number of locked pages - * @dmabuf: dmabuf used to for exporting to user space + * @refcount: reference counter * @flags: defined by TEE_SHM_* in tee_drv.h * @id: unique id of a shared memory object on this device, shared * with user space @@ -214,7 +214,7 @@ struct tee_shm { unsigned int offset; struct page **pages; size_t num_pages; - struct dma_buf *dmabuf; + refcount_t refcount; u32 flags; int id; u64 sec_world_id; -- cgit v1.2.3 From d1e86325af377129adb7fc6f34eb044ca6068b47 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Dec 2021 15:34:15 +0000 Subject: net: phylink: add mac_select_pcs() method to phylink_mac_ops mac_select_pcs() allows us to have an explicit point to query 
which PCS the MAC wishes to use for a particular PHY interface mode, thereby allowing us to add support to validate the link settings with the PCS. Phylink will also use this to select the PCS to be used during a major configuration event without the MAC driver needing to call phylink_set_pcs(). Note that if mac_select_pcs() is present, the supported_interfaces bitmap must be filled in; this avoids mac_select_pcs() being called with PHY_INTERFACE_MODE_NA when we want to get support for all interface types. Phylink will return an error in phylink_create() unless this condition is satisfied. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index a2f266cc3442..b3086dcafeaf 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -112,6 +112,7 @@ struct phylink_config { /** * struct phylink_mac_ops - MAC operations structure. * @validate: Validate and update the link configuration. + * @mac_select_pcs: Select a PCS for the interface mode. * @mac_pcs_get_state: Read the current link state from the hardware. * @mac_prepare: prepare for a major reconfiguration of the interface. * @mac_config: configure the MAC for the selected mode and state. @@ -126,6 +127,8 @@ struct phylink_mac_ops { void (*validate)(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); + struct phylink_pcs *(*mac_select_pcs)(struct phylink_config *config, + phy_interface_t interface); void (*mac_pcs_get_state)(struct phylink_config *config, struct phylink_link_state *state); int (*mac_prepare)(struct phylink_config *config, unsigned int mode, @@ -178,6 +181,21 @@ struct phylink_mac_ops { */ void validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); +/** + * mac_select_pcs: Select a PCS for the interface mode. 
+ * @config: a pointer to a &struct phylink_config. + * @interface: PHY interface mode for PCS + * + * Return the &struct phylink_pcs for the specified interface mode, or + * NULL if none is required, or an error pointer on error. + * + * This must not modify any state. It is used to query which PCS should + * be used. Phylink will use this during validation to ensure that the + * configuration is valid, and when setting a configuration to internally + * set the PCS that will be used. + */ +struct phylink_pcs *mac_select_pcs(struct phylink_config *config, + phy_interface_t interface); /** * mac_pcs_get_state() - Read the current inband link state from the hardware -- cgit v1.2.3 From 0d22d4b626a4eaa3196019092eb6c1919e9f8caa Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Dec 2021 15:34:20 +0000 Subject: net: phylink: add pcs_validate() method Add a hook for PCS to validate the link parameters. This avoids MAC drivers having to have knowledge of their PCS in their validate() method, thereby allowing several MAC drivers to be simplified. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index b3086dcafeaf..713a0c928b7c 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -416,6 +416,7 @@ struct phylink_pcs { /** * struct phylink_pcs_ops - MAC PCS operations structure. + * @pcs_validate: validate the link configuration. * @pcs_get_state: read the current MAC PCS link state from the hardware. * @pcs_config: configure the MAC PCS for the selected mode and state. * @pcs_an_restart: restart 802.3z BaseX autonegotiation. @@ -423,6 +424,8 @@ struct phylink_pcs { * (where necessary). 
*/ struct phylink_pcs_ops { + int (*pcs_validate)(struct phylink_pcs *pcs, unsigned long *supported, + const struct phylink_link_state *state); void (*pcs_get_state)(struct phylink_pcs *pcs, struct phylink_link_state *state); int (*pcs_config)(struct phylink_pcs *pcs, unsigned int mode, @@ -435,6 +438,23 @@ struct phylink_pcs_ops { }; #if 0 /* For kernel-doc purposes only. */ +/** + * pcs_validate() - validate the link configuration. + * @pcs: a pointer to a &struct phylink_pcs. + * @supported: ethtool bitmask for supported link modes. + * @state: a const pointer to a &struct phylink_link_state. + * + * Validate the interface mode, and advertising's autoneg bit, removing any + * media ethtool link modes that would not be supportable from the supported + * mask. Phylink will propagate the changes to the advertising mask. See the + * &struct phylink_mac_ops validate() method. + * + * Returns -EINVAL if the interface mode/autoneg mode is not supported. + * Returns non-zero positive if the link state can be supported. + */ +int pcs_validate(struct phylink_pcs *pcs, unsigned long *supported, + const struct phylink_link_state *state); + /** * pcs_get_state() - Read the current inband link state from the hardware * @pcs: a pointer to a &struct phylink_pcs. -- cgit v1.2.3 From c6aeaf56f468a565f6d2f27325fc07d35cdcd3cb Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 9 Sep 2021 15:51:24 +0200 Subject: drm/tegra: Implement correct DMA-BUF semantics DMA-BUF requires that each device that accesses a DMA-BUF attaches to it separately. To do so the host1x_bo_pin() and host1x_bo_unpin() functions need to be reimplemented so that they can return a mapping, which either represents an attachment or a map of the driver's own GEM object. 
Signed-off-by: Thierry Reding --- include/linux/host1x.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 7bccf589aba7..157326074df8 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -7,6 +7,7 @@ #define __LINUX_HOST1X_H #include +#include #include enum host1x_class { @@ -82,12 +83,23 @@ struct host1x_client { struct host1x_bo; struct sg_table; +struct host1x_bo_mapping { + struct dma_buf_attachment *attach; + enum dma_data_direction direction; + struct host1x_bo *bo; + struct sg_table *sgt; + unsigned int chunks; + struct device *dev; + dma_addr_t phys; + size_t size; +}; + struct host1x_bo_ops { struct host1x_bo *(*get)(struct host1x_bo *bo); void (*put)(struct host1x_bo *bo); - struct sg_table *(*pin)(struct device *dev, struct host1x_bo *bo, - dma_addr_t *phys); - void (*unpin)(struct device *dev, struct sg_table *sgt); + struct host1x_bo_mapping *(*pin)(struct device *dev, struct host1x_bo *bo, + enum dma_data_direction dir); + void (*unpin)(struct host1x_bo_mapping *map); void *(*mmap)(struct host1x_bo *bo); void (*munmap)(struct host1x_bo *bo, void *addr); }; @@ -112,17 +124,15 @@ static inline void host1x_bo_put(struct host1x_bo *bo) bo->ops->put(bo); } -static inline struct sg_table *host1x_bo_pin(struct device *dev, - struct host1x_bo *bo, - dma_addr_t *phys) +static inline struct host1x_bo_mapping *host1x_bo_pin(struct device *dev, struct host1x_bo *bo, + enum dma_data_direction dir) { - return bo->ops->pin(dev, bo, phys); + return bo->ops->pin(dev, bo, dir); } -static inline void host1x_bo_unpin(struct device *dev, struct host1x_bo *bo, - struct sg_table *sgt) +static inline void host1x_bo_unpin(struct host1x_bo_mapping *map) { - bo->ops->unpin(dev, sgt); + map->bo->ops->unpin(map); } static inline void *host1x_bo_mmap(struct host1x_bo *bo) -- cgit v1.2.3 From 1f39b1dfa53c84b56d7ad37fed44afda7004959d Mon 
Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 7 Feb 2020 16:50:52 +0100 Subject: drm/tegra: Implement buffer object cache This cache is used to avoid mapping and unmapping buffer objects unnecessarily. Mappings are cached per client and stay hot until the buffer object is destroyed. Signed-off-by: Thierry Reding --- include/linux/host1x.h | 53 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 157326074df8..d0bb16cdd005 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -8,6 +8,7 @@ #include #include +#include #include enum host1x_class { @@ -24,6 +25,28 @@ struct iommu_group; u64 host1x_get_dma_mask(struct host1x *host1x); +/** + * struct host1x_bo_cache - host1x buffer object cache + * @mappings: list of mappings + * @lock: synchronizes accesses to the list of mappings + */ +struct host1x_bo_cache { + struct list_head mappings; + struct mutex lock; +}; + +static inline void host1x_bo_cache_init(struct host1x_bo_cache *cache) +{ + INIT_LIST_HEAD(&cache->mappings); + mutex_init(&cache->lock); +} + +static inline void host1x_bo_cache_destroy(struct host1x_bo_cache *cache) +{ + /* XXX warn if not empty? 
*/ + mutex_destroy(&cache->lock); +} + /** * struct host1x_client_ops - host1x client operations * @early_init: host1x client early initialization code @@ -74,6 +97,8 @@ struct host1x_client { struct host1x_client *parent; unsigned int usecount; struct mutex lock; + + struct host1x_bo_cache cache; }; /* @@ -84,16 +109,26 @@ struct host1x_bo; struct sg_table; struct host1x_bo_mapping { + struct kref ref; struct dma_buf_attachment *attach; enum dma_data_direction direction; + struct list_head list; struct host1x_bo *bo; struct sg_table *sgt; unsigned int chunks; struct device *dev; dma_addr_t phys; size_t size; + + struct host1x_bo_cache *cache; + struct list_head entry; }; +static inline struct host1x_bo_mapping *to_host1x_bo_mapping(struct kref *ref) +{ + return container_of(ref, struct host1x_bo_mapping, ref); +} + struct host1x_bo_ops { struct host1x_bo *(*get)(struct host1x_bo *bo); void (*put)(struct host1x_bo *bo); @@ -106,11 +141,15 @@ struct host1x_bo_ops { struct host1x_bo { const struct host1x_bo_ops *ops; + struct list_head mappings; + spinlock_t lock; }; static inline void host1x_bo_init(struct host1x_bo *bo, const struct host1x_bo_ops *ops) { + INIT_LIST_HEAD(&bo->mappings); + spin_lock_init(&bo->lock); bo->ops = ops; } @@ -124,16 +163,10 @@ static inline void host1x_bo_put(struct host1x_bo *bo) bo->ops->put(bo); } -static inline struct host1x_bo_mapping *host1x_bo_pin(struct device *dev, struct host1x_bo *bo, - enum dma_data_direction dir) -{ - return bo->ops->pin(dev, bo, dir); -} - -static inline void host1x_bo_unpin(struct host1x_bo_mapping *map) -{ - map->bo->ops->unpin(map); -} +struct host1x_bo_mapping *host1x_bo_pin(struct device *dev, struct host1x_bo *bo, + enum dma_data_direction dir, + struct host1x_bo_cache *cache); +void host1x_bo_unpin(struct host1x_bo_mapping *map); static inline void *host1x_bo_mmap(struct host1x_bo *bo) { -- cgit v1.2.3 From 46f226c93d35b936aeec6eb31da932dc2e86f413 Mon Sep 17 00:00:00 2001 From: Mikko Perttunen Date: 
Thu, 16 Sep 2021 17:55:17 +0300 Subject: drm/tegra: Add NVDEC driver Add support for booting and using NVDEC on Tegra210, Tegra186 and Tegra194 to the Host1x and TegraDRM drivers. Booting in secure mode is not currently supported. Signed-off-by: Mikko Perttunen Signed-off-by: Thierry Reding --- include/linux/host1x.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index d0bb16cdd005..2ca53d7ed7ca 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -17,6 +17,8 @@ enum host1x_class { HOST1X_CLASS_GR2D_SB = 0x52, HOST1X_CLASS_VIC = 0x5D, HOST1X_CLASS_GR3D = 0x60, + HOST1X_CLASS_NVDEC = 0xF0, + HOST1X_CLASS_NVDEC1 = 0xF5, }; struct host1x; -- cgit v1.2.3 From 9ca790f44606109071ab1a3a37ed99e91794c37c Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 1 Dec 2021 02:23:16 +0300 Subject: gpu: host1x: Add host1x_channel_stop() Add host1x_channel_stop() which waits till channel becomes idle and then stops the channel hardware. This is needed for supporting suspend/resume by host1x drivers since the hardware state is lost after power-gating, thus the channel needs to be stopped before client enters into suspend. 
Tested-by: Peter Geis # Ouya T30 Tested-by: Paul Fertser # PAZ00 T20 Tested-by: Nicolas Chauvet # PAZ00 T20 and TK1 T124 Tested-by: Matt Merhar # Ouya T30 Signed-off-by: Dmitry Osipenko Signed-off-by: Thierry Reding --- include/linux/host1x.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 2ca53d7ed7ca..e8dc5bc41f79 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -226,6 +226,7 @@ struct host1x_job; struct host1x_channel *host1x_channel_request(struct host1x_client *client); struct host1x_channel *host1x_channel_get(struct host1x_channel *channel); +void host1x_channel_stop(struct host1x_channel *channel); void host1x_channel_put(struct host1x_channel *channel); int host1x_job_submit(struct host1x_job *job); -- cgit v1.2.3 From c0cdc89072a3e1ae3981437f385de14b7bba8fd8 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 27 Oct 2021 16:15:04 +0100 Subject: irqchip/gic-v3-its: Give the percpu rdist struct its own flags field Later patches will require tracking some per-rdist status. Reuse the bytes "lost" to padding within the __percpu rdist struct as a flags field, and re-encode ->lpi_enabled within said flags. No change in functionality intended. 
Signed-off-by: Valentin Schneider Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20211027151506.2085066-2-valentin.schneider@arm.com --- include/linux/irqchip/arm-gic-v3.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 81cbf85f73de..0dc34d7d735a 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -615,7 +615,7 @@ struct rdists { void __iomem *rd_base; struct page *pend_page; phys_addr_t phys_base; - bool lpi_enabled; + u64 flags; cpumask_t *vpe_table_mask; void *vpe_l1_base; } __percpu *rdist; -- cgit v1.2.3 From d23bc2bc1d634658d7fa96395419c1c553a784f0 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 27 Oct 2021 16:15:05 +0100 Subject: irqchip/gic-v3-its: Postpone LPI pending table freeing and memreserve Memory used by the LPI tables have to be made persistent for kexec to have a chance to work, as explained in [1]. If they have been made persistent and we are booting into a kexec'd kernel, we also need to free the pages that were preemptively allocated by the new kernel for those tables. Both of those operations currently happen during its_cpu_init(), which happens in a _STARTING (IOW atomic) cpuhp callback for secondary CPUs. efi_mem_reserve_iomem() issues a GFP_ATOMIC allocation, which unfortunately doesn't work under PREEMPT_RT (this ends up grabbing a non-raw spinlock, which can sleep under PREEMPT_RT). Similarly, freeing the pages ends up grabbing a sleepable spinlock. Since the memreserve is only required by kexec, it doesn't have to be done so early in the secondary boot process. Issue the reservation in a new CPUHP_AP_ONLINE_DYN cpuhp callback, and piggy-back the page freeing on top of it. A CPU gets to run the body of this new callback exactly once. 
As kexec issues a machine_shutdown() prior to machine_kexec(), it will be serialized vs a CPU being plugged to life by the hotplug machinery - either the CPU will have been brought up and have had its redistributor's pending table memreserved, or it never went online and will have its table allocated by the new kernel. [1]: https://lore.kernel.org/lkml/20180921195954.21574-1-marc.zyngier@arm.com/ Signed-off-by: Valentin Schneider Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20211027151506.2085066-3-valentin.schneider@arm.com --- include/linux/irqchip/arm-gic-v3.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 0dc34d7d735a..51b85506ae90 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -632,6 +632,7 @@ struct rdists { struct irq_domain; struct fwnode_handle; +int __init its_lpi_memreserve_init(void); int its_cpu_init(void); int its_init(struct fwnode_handle *handle, struct rdists *rdists, struct irq_domain *domain); -- cgit v1.2.3 From 835f442fdbce33a47a6bde356643fd7e3ef7ec1b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 27 Oct 2021 16:15:06 +0100 Subject: irqchip/gic-v3-its: Limit memreserve cpuhp state lifetime The new memreserve cpuhp callback only needs to survive up until a point where every CPU in the system has booted once. Beyond that, it becomes a no-op and can be put in the bin. 
Signed-off-by: Valentin Schneider Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20211027151506.2085066-4-valentin.schneider@arm.com --- include/linux/irqchip/arm-gic-v3.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 51b85506ae90..12d91f0dedf9 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -624,6 +624,7 @@ struct rdists { u64 flags; u32 gicd_typer; u32 gicd_typer2; + int cpuhp_memreserve_state; bool has_vlpis; bool has_rvpeid; bool has_direct_lpi; -- cgit v1.2.3 From 3c67d44de787dff288d7f2a51c372b22f7356db6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Dec 2021 06:48:53 -0700 Subject: block: add mq_ops->queue_rqs hook If we have a list of requests in our plug list, send it to the driver in one go, if possible. The driver must set mq_ops->queue_rqs() to support this, if not the usual one-by-one path is used. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 772f8f921526..550996cf419c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -492,6 +492,14 @@ struct blk_mq_ops { */ void (*commit_rqs)(struct blk_mq_hw_ctx *); + /** + * @queue_rqs: Queue a list of new requests. Driver is guaranteed + * that each request belongs to the same queue. If the driver doesn't + * empty the @rqlist completely, then the rest will be queued + * individually by the block layer upon return. 
+ */ + void (*queue_rqs)(struct request **rqlist); + /** * @get_budget: Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the -- cgit v1.2.3 From 8a2ba1785c5803d59a63b6320ff54fd4a37a41ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:21 +0100 Subject: block: remove the nr_task field from struct io_context Nothing ever looks at ->nr_tasks, so remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index c1229fbd6691..82c7f4f5f4f5 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -99,7 +99,6 @@ struct io_cq { struct io_context { atomic_long_t refcount; atomic_t active_ref; - atomic_t nr_tasks; /* all the fields below are protected by this lock */ spinlock_t lock; -- cgit v1.2.3 From a411cd3cfdc5bbd1329d5b33dbf39e2b5213969d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:27 +0100 Subject: block: move set_task_ioprio to blk-ioc.c Keep set_task_ioprio with the other low-level code that accesses the io_context structure. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-8-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 82c7f4f5f4f5..648331f35fc6 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -116,8 +116,6 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -struct io_context *get_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node); int __copy_io(unsigned long clone_flags, struct task_struct *tsk); static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) { -- cgit v1.2.3 From 5ef1630586317e92c9ebd7b4ce48f393b7ff790f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Dec 2021 07:31:31 +0100 Subject: block: only build the icq tracking code when needed Only bfq needs the code to track icq, so make it conditional. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20211209063131.18537-12-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 648331f35fc6..14f7eaf1b443 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -100,16 +100,18 @@ struct io_context { atomic_long_t refcount; atomic_t active_ref; + unsigned short ioprio; + +#ifdef CONFIG_BLK_ICQ /* all the fields below are protected by this lock */ spinlock_t lock; - unsigned short ioprio; - struct radix_tree_root icq_tree; struct io_cq __rcu *icq_hint; struct hlist_head icq_list; struct work_struct release_work; +#endif /* CONFIG_BLK_ICQ */ }; struct task_struct; -- cgit v1.2.3 From 85f5a74c2b9ba213d4102dc12ccbfdbe26472abb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 8 Apr 2021 01:33:45 -0400 Subject: block: Add bio_add_folio() This is a thin wrapper around bio_add_page(). The main advantage here is the documentation that folios larger than 2GiB are not supported. It's not currently possible to allocate folios that large, but if it ever becomes possible, this function will fail gracefully instead of doing I/O to the wrong bytes. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jens Axboe Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong --- include/linux/bio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index fe6bdfbbef66..a783cac49978 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -409,7 +409,8 @@ extern void bio_uninit(struct bio *); extern void bio_reset(struct bio *); void bio_chain(struct bio *, struct bio *); -extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); +int bio_add_page(struct bio *, struct page *, unsigned len, unsigned off); +bool bio_add_folio(struct bio *, struct folio *, size_t len, size_t off); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); int bio_add_zone_append_page(struct bio *bio, struct page *page, -- cgit v1.2.3 From 640d1930bef4f87ec8d8d2b05f0f6edc1dfcf662 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 4 Jan 2021 10:58:17 -0500 Subject: block: Add bio_for_each_folio_all() Allow callers to iterate over each folio instead of each page. The bio need not have been constructed using folios originally. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jens Axboe Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong --- include/linux/bio.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index a783cac49978..e3c9e8207f12 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -166,7 +166,7 @@ static inline void bio_advance(struct bio *bio, unsigned int nbytes) */ #define bio_for_each_bvec_all(bvl, bio, i) \ for (i = 0, bvl = bio_first_bvec_all(bio); \ - i < (bio)->bi_vcnt; i++, bvl++) \ + i < (bio)->bi_vcnt; i++, bvl++) #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) @@ -260,6 +260,57 @@ static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) return &bio->bi_io_vec[bio->bi_vcnt - 1]; } +/** + * struct folio_iter - State for iterating all folios in a bio. + * @folio: The current folio we're iterating. NULL after the last folio. + * @offset: The byte offset within the current folio. + * @length: The number of bytes in this iteration (will not cross folio + * boundary). 
+ */ +struct folio_iter { + struct folio *folio; + size_t offset; + size_t length; + /* private: for use by the iterator */ + size_t _seg_count; + int _i; +}; + +static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio, + int i) +{ + struct bio_vec *bvec = bio_first_bvec_all(bio) + i; + + fi->folio = page_folio(bvec->bv_page); + fi->offset = bvec->bv_offset + + PAGE_SIZE * (bvec->bv_page - &fi->folio->page); + fi->_seg_count = bvec->bv_len; + fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count); + fi->_i = i; +} + +static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) +{ + fi->_seg_count -= fi->length; + if (fi->_seg_count) { + fi->folio = folio_next(fi->folio); + fi->offset = 0; + fi->length = min(folio_size(fi->folio), fi->_seg_count); + } else if (fi->_i + 1 < bio->bi_vcnt) { + bio_first_folio(fi, bio, fi->_i + 1); + } else { + fi->folio = NULL; + } +} + +/** + * bio_for_each_folio_all - Iterate over each folio in a bio. + * @fi: struct folio_iter which is updated for each folio. + * @bio: struct bio to iterate over. + */ +#define bio_for_each_folio_all(fi, bio) \ + for (bio_first_folio(&fi, bio, 0); fi.folio; bio_next_folio(&fi, bio)) + enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ -- cgit v1.2.3 From 8306a5f56305521d8b307b4ee1f69949fbb49279 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 28 Apr 2021 07:51:36 -0400 Subject: iomap: Add iomap_invalidate_folio Keep iomap_invalidatepage around as a wrapper for use in address_space operations. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong --- include/linux/iomap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6d1b08d0ae93..29491fb9c5ba 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -225,6 +225,7 @@ void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count); int iomap_releasepage(struct page *page, gfp_t gfp_mask); +void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); void iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len); #ifdef CONFIG_MIGRATION -- cgit v1.2.3 From ea6fa4961aab8f90a8aa03575a98b4bda368d4b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Jo=C5=84czyk?= Date: Fri, 10 Dec 2021 21:01:26 +0100 Subject: rtc: mc146818-lib: fix RTC presence check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prevent an infinite loop in mc146818_get_time(), commit 211e5db19d15 ("rtc: mc146818: Detect and handle broken RTCs") added a check for RTC availability. Together with a later fix, it checked if bit 6 in register 0x0d is cleared. This, however, caused a false negative on a motherboard with an AMD SB710 southbridge; according to the specification [1], bit 6 of register 0x0d of this chipset is a scratchbit. This caused a regression in Linux 5.11 - the RTC was determined broken by the kernel and not used by rtc-cmos.c [3]. This problem was also reported in Fedora [4]. As a better alternative, check whether the UIP ("Update-in-progress") bit is set for longer than 10ms. If that is the case, then apparently the RTC is either absent (and all register reads return 0xff) or broken. Also limit the number of loop iterations in mc146818_get_time() to 10 to prevent an infinite loop there.
The functions mc146818_get_time() and mc146818_does_rtc_work() will be refactored later in this patch series, in order to fix a separate problem with reading / setting the RTC alarm time. This is done to avoid confusion about what is being fixed when. In a previous approach to this problem, I implemented a check whether the RTC_HOURS register contains a value <= 24. This, however, sometimes did not work correctly on my Intel Kaby Lake laptop. According to Intel's documentation [2], "the time and date RAM locations (0-9) are disconnected from the external bus" during the update cycle so reading this register without checking the UIP bit is incorrect. [1] AMD SB700/710/750 Register Reference Guide, page 308, https://developer.amd.com/wordpress/media/2012/10/43009_sb7xx_rrg_pub_1.00.pdf [2] 7th Generation Intel® Processor Family I/O for U/Y Platforms [...] Datasheet Volume 1 of 2, page 209 Intel's Document Number: 334658-006, https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/7th-and-8th-gen-core-family-mobile-u-y-processor-lines-i-o-datasheet-vol-1.pdf [3] Functions in arch/x86/kernel/rtc.c apparently were using it.
[4] https://bugzilla.redhat.com/show_bug.cgi?id=1936688 Fixes: 211e5db19d15 ("rtc: mc146818: Detect and handle broken RTCs") Fixes: ebb22a059436 ("rtc: mc146818: Dont test for bit 0-5 in Register D") Signed-off-by: Mateusz Jończyk Cc: Thomas Gleixner Cc: Alessandro Zummo Cc: Alexandre Belloni Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-5-mat.jonczyk@o2.pl --- include/linux/mc146818rtc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index 0661af17a758..69c80c4325bf 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -123,6 +123,7 @@ struct cmos_rtc_board_info { #define RTC_IO_EXTENT_USED RTC_IO_EXTENT #endif /* ARCH_RTC_LOCATION */ +bool mc146818_does_rtc_work(void); unsigned int mc146818_get_time(struct rtc_time *time); int mc146818_set_time(struct rtc_time *time); -- cgit v1.2.3 From ec5895c0f2d87b9bf4185db1915e40fa6fcfc0ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Jo=C5=84czyk?= Date: Fri, 10 Dec 2021 21:01:27 +0100 Subject: rtc: mc146818-lib: extract mc146818_avoid_UIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function mc146818_get_time() contains an elaborate mechanism of reading the RTC time while no RTC update is in progress. It turns out that reading the RTC alarm clock also requires avoiding the RTC update. Therefore, the mechanism in mc146818_get_time() should be reused - so extract it into a separate function. The logic in mc146818_avoid_UIP() is same as in mc146818_get_time() except that after every if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) { there is now "mdelay(1)". To avoid producing a very unreadable patch, mc146818_get_time() will be refactored to use mc146818_avoid_UIP() in the next patch. 
Signed-off-by: Mateusz Jończyk Cc: Alessandro Zummo Cc: Alexandre Belloni Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-6-mat.jonczyk@o2.pl --- include/linux/mc146818rtc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index 69c80c4325bf..67fb0a12becc 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -127,4 +127,7 @@ bool mc146818_does_rtc_work(void); unsigned int mc146818_get_time(struct rtc_time *time); int mc146818_set_time(struct rtc_time *time); +bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), + void *param); + #endif /* _MC146818RTC_H */ -- cgit v1.2.3 From 34fff62827b254f8a43633cc878deb04bf11297c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:18:54 +0100 Subject: device: Move MSI related data into a struct The only unconditional part of MSI data in struct device is the irqdomain pointer. Everything else can be allocated on demand. Create a data structure and move the irqdomain pointer into it. The other MSI specific parts are going to be removed from struct device in later steps. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20211210221813.617178827@linutronix.de --- include/linux/device.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 2a22875238a6..f212b7a7b156 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -371,6 +371,16 @@ struct dev_links_info { enum dl_dev_state status; }; +/** + * struct dev_msi_info - Device data related to MSI + * @domain: The MSI interrupt domain associated to the device + */ +struct dev_msi_info { +#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN + struct irq_domain *domain; +#endif +}; + /** * struct device - The basic device structure * @parent: The device's "parent" device, the device to which it is attached. @@ -407,8 +417,8 @@ struct dev_links_info { * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pin-control.rst for details. + * @msi: MSI related data * @msi_list: Hosts MSI descriptors - * @msi_domain: The generic MSI domain this device is using. * @numa_node: NUMA node this device is close to. * @dma_ops: DMA mapping operations for this device. * @dma_mask: Dma mask (if dma'ble device). 
@@ -500,12 +510,10 @@ struct device { struct em_perf_domain *em_pd; #endif -#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN - struct irq_domain *msi_domain; -#endif #ifdef CONFIG_PINCTRL struct dev_pin_info *pins; #endif + struct dev_msi_info msi; #ifdef CONFIG_GENERIC_MSI_IRQ struct list_head msi_list; #endif @@ -666,7 +674,7 @@ static inline void set_dev_node(struct device *dev, int node) static inline struct irq_domain *dev_get_msi_domain(const struct device *dev) { #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN - return dev->msi_domain; + return dev->msi.domain; #else return NULL; #endif @@ -675,7 +683,7 @@ static inline struct irq_domain *dev_get_msi_domain(const struct device *dev) static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d) { #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN - dev->msi_domain = d; + dev->msi.domain = d; #endif } -- cgit v1.2.3 From 013bd8e543c2c777b586cf033c588ea82bd502db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:18:55 +0100 Subject: device: Add device::msi_data pointer and struct msi_device_data Create struct msi_device_data and add a pointer of that type to struct dev_msi_info, which is part of struct device. Provide an allocator function which can be invoked from the MSI interrupt allocation code paths. Add a properties field to the data structure as a first member so the allocation size is not zero bytes. The field will be used later on.
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221813.676660809@linutronix.de --- include/linux/device.h | 5 +++++ include/linux/msi.h | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index f212b7a7b156..f0033cd93631 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -45,6 +45,7 @@ struct iommu_ops; struct iommu_group; struct dev_pin_info; struct dev_iommu; +struct msi_device_data; /** * struct subsys_interface - interfaces to device functions @@ -374,11 +375,15 @@ struct dev_links_info { /** * struct dev_msi_info - Device data related to MSI * @domain: The MSI interrupt domain associated to the device + * @data: Pointer to MSI device data */ struct dev_msi_info { #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN struct irq_domain *domain; #endif +#ifdef CONFIG_GENERIC_MSI_IRQ + struct msi_device_data *data; +#endif }; /** diff --git a/include/linux/msi.h b/include/linux/msi.h index ba4a39c430b5..7e4c8fd7c65d 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -171,6 +171,16 @@ struct msi_desc { }; }; +/** + * msi_device_data - MSI per device data + * @properties: MSI properties which are interesting to drivers + */ +struct msi_device_data { + unsigned long properties; +}; + +int msi_setup_device_data(struct device *dev); + /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) #define dev_to_msi_list(dev) (&(dev)->msi_list) @@ -233,10 +243,16 @@ void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); #ifdef CONFIG_SYSFS +int msi_device_populate_sysfs(struct device *dev); +void msi_device_destroy_sysfs(struct device *dev); + const struct attribute_group **msi_populate_sysfs(struct device *dev); void msi_destroy_sysfs(struct device 
*dev, const struct attribute_group **msi_irq_groups); #else +static inline int msi_device_populate_sysfs(struct device *dev) { return 0; } +static inline void msi_device_destroy_sysfs(struct device *dev) { } + static inline const struct attribute_group **msi_populate_sysfs(struct device *dev) { return NULL; @@ -384,6 +400,8 @@ enum { MSI_FLAG_MUST_REACTIVATE = (1 << 5), /* Is level-triggered capable, using two messages */ MSI_FLAG_LEVEL_CAPABLE = (1 << 6), + /* Populate sysfs on alloc() and destroy it on free() */ + MSI_FLAG_DEV_SYSFS = (1 << 7), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, -- cgit v1.2.3 From 3f35d2cf9fbc656db82579d849cc69c373b1ad0d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Dec 2021 18:16:44 +0100 Subject: PCI/MSI: Decouple MSI[-X] disable from pcim_release() The MSI core will introduce runtime allocation of MSI related data. This data will be devres managed and has to be set up before enabling PCI/MSI[-X]. This would introduce an ordering issue vs. pcim_release(). The setup order is: pcim_enable_device() devres_alloc(pcim_release...); ... pci_irq_alloc() msi_setup_device_data() devres_alloc(msi_device_data_release, ...) and once the device is released these release functions are invoked in the opposite order: msi_device_data_release() ... pcim_release() pci_disable_msi[x]() which is obviously wrong, because pci_disable_msi[x]() requires the MSI data to be available to tear down the MSI[-X] interrupts. Remove the MSI[-X] teardown from pcim_release() and add an explicit action to be installed on the attempt of enabling PCI/MSI[-X]. This allows the MSI core data allocation to be ordered correctly in a subsequent step. 
Reported-by: Nishanth Menon Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/87tuf9rdoj.ffs@tglx --- include/linux/pci.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 5cc46baef519..a09736d3e05e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -425,7 +425,8 @@ struct pci_dev { unsigned int ats_enabled:1; /* Address Translation Svc */ unsigned int pasid_enabled:1; /* Process Address Space ID */ unsigned int pri_enabled:1; /* Page Request Interface */ - unsigned int is_managed:1; + unsigned int is_managed:1; /* Managed via devres */ + unsigned int is_msi_managed:1; /* MSI release via devres installed */ unsigned int needs_freset:1; /* Requires fundamental reset */ unsigned int state_saved:1; unsigned int is_physfn:1; -- cgit v1.2.3 From bf6e054e0e3fbc9614355b760e18c8a14f952a4e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:03 +0100 Subject: genirq/msi: Provide msi_device_populate/destroy_sysfs() Add new allocation functions which can be activated by domain info flags. They store the groups pointer in struct msi_device_data. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221813.988659194@linutronix.de --- include/linux/msi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 7e4c8fd7c65d..1b96dc483b88 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -56,6 +56,8 @@ struct irq_data; struct msi_desc; struct pci_dev; struct platform_msi_priv_data; +struct attribute_group; + void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); @@ -174,9 +176,11 @@ struct msi_desc { /** * msi_device_data - MSI per device data * @properties: MSI properties which are interesting to drivers + * @attrs: Pointer to the sysfs attribute group */ struct msi_device_data { unsigned long properties; + const struct attribute_group **attrs; }; int msi_setup_device_data(struct device *dev); -- cgit v1.2.3 From ffd84485e6beb9cad3e5a133d88201b995298c33 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:05 +0100 Subject: PCI/MSI: Let the irq code handle sysfs groups Set the domain info flag which makes the core code handle sysfs groups and put an explicit invocation into the legacy code. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211210221814.048612053@linutronix.de --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index a09736d3e05e..0a7b6b2f163b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -476,7 +476,6 @@ struct pci_dev { #ifdef CONFIG_PCI_MSI void __iomem *msix_base; raw_spinlock_t msi_lock; - const struct attribute_group **msi_irq_groups; #endif struct pci_vpd vpd; #ifdef CONFIG_PCIE_DPC -- cgit v1.2.3 From 24cff375fdb663c2238f06693a067b9219596fdc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:08 +0100 Subject: genirq/msi: Remove the original sysfs interfaces No more users. Refactor the core code accordingly and move the global interface under CONFIG_PCI_MSI_ARCH_FALLBACKS. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221814.168362229@linutronix.de --- include/linux/msi.h | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 1b96dc483b88..634a12962e72 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -249,22 +249,10 @@ void pci_msi_unmask_irq(struct irq_data *data); #ifdef CONFIG_SYSFS int msi_device_populate_sysfs(struct device *dev); void msi_device_destroy_sysfs(struct device *dev); - -const struct attribute_group **msi_populate_sysfs(struct device *dev); -void msi_destroy_sysfs(struct device *dev, - const struct attribute_group **msi_irq_groups); -#else +#else /* CONFIG_SYSFS */ static inline int msi_device_populate_sysfs(struct device *dev) { return 0; } static inline void msi_device_destroy_sysfs(struct device *dev) { } - -static inline const struct attribute_group **msi_populate_sysfs(struct device *dev) -{ - return NULL; -} -static inline void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups) -{ -} -#endif +#endif /* !CONFIG_SYSFS */ /* * The arch hooks to setup up msi irqs. 
Default functions are implemented @@ -279,7 +267,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); -#endif +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ /* * The restore hook is still available even for fully irq domain based -- cgit v1.2.3 From 9835cec6d557b0bff3d48bd91cd0484aba59386c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:09 +0100 Subject: platform-msi: Rename functions and clarify comments It's hard to distinguish what platform_msi_domain_alloc() and platform_msi_domain_alloc_irqs() are about. Make the distinction more explicit and add comments which explain the use cases properly. Signed-off-by: Thomas Gleixner Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221814.228706214@linutronix.de --- include/linux/msi.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 634a12962e72..12dd28629c43 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -435,10 +435,10 @@ __platform_msi_create_device_domain(struct device *dev, #define platform_msi_create_device_tree_domain(dev, nvec, write, ops, data) \ __platform_msi_create_device_domain(dev, nvec, true, write, ops, data) -int platform_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs); -void platform_msi_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nvec); +int platform_msi_device_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs); +void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nvec); void *platform_msi_get_host_data(struct irq_domain *domain); #endif /* 
CONFIG_GENERIC_MSI_IRQ_DOMAIN */ -- cgit v1.2.3 From fc22e7dbcdb3e06a3d3ce05fc91c6a2345411f9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:11 +0100 Subject: platform-msi: Store platform private data pointer in msi_device_data Storing the platform private data in an MSI descriptor is sloppy at best. The data belongs to the device and not to the descriptor. Add a pointer to struct msi_device_data and store the pointer there. Signed-off-by: Thomas Gleixner Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221814.287680528@linutronix.de --- include/linux/msi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 12dd28629c43..cdf0d09c3ad4 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -108,11 +108,9 @@ struct pci_msi_desc { /** * platform_msi_desc - Platform device specific msi descriptor data - * @msi_priv_data: Pointer to platform private data * @msi_index: The index of the MSI descriptor for multi MSI */ struct platform_msi_desc { - struct platform_msi_priv_data *msi_priv_data; u16 msi_index; }; @@ -177,10 +175,12 @@ struct msi_desc { * msi_device_data - MSI per device data * @properties: MSI properties which are interesting to drivers * @attrs: Pointer to the sysfs attribute group + * @platform_data: Platform-MSI specific data */ struct msi_device_data { unsigned long properties; const struct attribute_group **attrs; + struct platform_msi_priv_data *platform_data; }; int msi_setup_device_data(struct device *dev); -- cgit v1.2.3 From 20c6d424cfe641659c9a025db8a8608701b27246 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:12 +0100 Subject: genirq/msi: Consolidate MSI descriptor data All non PCI/MSI usage variants have data structures in struct msi_desc with only one member: xxx_index. PCI/MSI has an entry_nr member.
Add a common msi_index member to struct msi_desc so all implementations can share it which allows further consolidation. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221814.350967317@linutronix.de --- include/linux/msi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index cdf0d09c3ad4..ee8fe49dedd1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -143,6 +143,7 @@ struct ti_sci_inta_msi_desc { * address or data changes * @write_msi_msg_data: Data parameter for the callback. * + * @msi_index: Index of the msi descriptor * @pci: [PCI] PCI speficic msi descriptor data * @platform: [platform] Platform device specific msi descriptor data * @fsl_mc: [fsl-mc] FSL MC device specific msi descriptor data @@ -163,6 +164,7 @@ struct msi_desc { void (*write_msi_msg)(struct msi_desc *entry, void *data); void *write_msi_msg_data; + u16 msi_index; union { struct pci_msi_desc pci; struct platform_msi_desc platform; -- cgit v1.2.3 From dba27c7fa36f468e7eb29b216879f8c33bf0955d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:14 +0100 Subject: platform-msi: Use msi_desc::msi_index Use the common msi_index member and get rid of the pointless wrapper struct. 
Signed-off-by: Thomas Gleixner Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221814.413638645@linutronix.de --- include/linux/msi.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index ee8fe49dedd1..1d85e954e130 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -106,14 +106,6 @@ struct pci_msi_desc { }; }; -/** - * platform_msi_desc - Platform device specific msi descriptor data - * @msi_index: The index of the MSI descriptor for multi MSI - */ -struct platform_msi_desc { - u16 msi_index; -}; - /** * fsl_mc_msi_desc - FSL-MC device specific msi descriptor data * @msi_index: The index of the MSI descriptor @@ -145,7 +137,6 @@ struct ti_sci_inta_msi_desc { * * @msi_index: Index of the msi descriptor * @pci: [PCI] PCI speficic msi descriptor data - * @platform: [platform] Platform device specific msi descriptor data * @fsl_mc: [fsl-mc] FSL MC device specific msi descriptor data * @inta: [INTA] TISCI based INTA specific msi descriptor data */ @@ -167,7 +158,6 @@ struct msi_desc { u16 msi_index; union { struct pci_msi_desc pci; - struct platform_msi_desc platform; struct fsl_mc_msi_desc fsl_mc; struct ti_sci_inta_msi_desc inta; }; -- cgit v1.2.3 From 78ee9fb4b8b126ed84a819a6e1732fd3039b525a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:15 +0100 Subject: bus: fsl-mc-msi: Use msi_desc::msi_index Use the common msi_index member and get rid of the pointless wrapper struct. 
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221814.477386185@linutronix.de --- include/linux/msi.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 1d85e954e130..25edf83ede41 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -106,14 +106,6 @@ struct pci_msi_desc { }; }; -/** - * fsl_mc_msi_desc - FSL-MC device specific msi descriptor data - * @msi_index: The index of the MSI descriptor - */ -struct fsl_mc_msi_desc { - u16 msi_index; -}; - /** * ti_sci_inta_msi_desc - TISCI based INTA specific msi descriptor data * @dev_index: TISCI device index @@ -137,7 +129,6 @@ struct ti_sci_inta_msi_desc { * * @msi_index: Index of the msi descriptor * @pci: [PCI] PCI speficic msi descriptor data - * @fsl_mc: [fsl-mc] FSL MC device specific msi descriptor data * @inta: [INTA] TISCI based INTA specific msi descriptor data */ struct msi_desc { @@ -158,7 +149,6 @@ struct msi_desc { u16 msi_index; union { struct pci_msi_desc pci; - struct fsl_mc_msi_desc fsl_mc; struct ti_sci_inta_msi_desc inta; }; }; -- cgit v1.2.3 From 0f18095871fc59c89a281caf6f18538cf9e50fbf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:17 +0100 Subject: soc: ti: ti_sci_inta_msi: Use msi_desc::msi_index Use the common msi_index member and get rid of the pointless wrapper struct. 
Signed-off-by: Thomas Gleixner Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Acked-by: Nishanth Menon Link: https://lore.kernel.org/r/20211210221814.540704224@linutronix.de --- include/linux/msi.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 25edf83ede41..45ec5d07a5f3 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -106,14 +106,6 @@ struct pci_msi_desc { }; }; -/** - * ti_sci_inta_msi_desc - TISCI based INTA specific msi descriptor data - * @dev_index: TISCI device index - */ -struct ti_sci_inta_msi_desc { - u16 dev_index; -}; - /** * struct msi_desc - Descriptor structure for MSI based interrupts * @list: List head for management @@ -128,8 +120,7 @@ struct ti_sci_inta_msi_desc { * @write_msi_msg_data: Data parameter for the callback. * * @msi_index: Index of the msi descriptor - * @pci: [PCI] PCI speficic msi descriptor data - * @inta: [INTA] TISCI based INTA specific msi descriptor data + * @pci: PCI specific msi descriptor data */ struct msi_desc { /* Shared device/bus type independent data */ @@ -147,10 +138,7 @@ struct msi_desc { void *write_msi_msg_data; u16 msi_index; - union { - struct pci_msi_desc pci; - struct ti_sci_inta_msi_desc inta; - }; + struct pci_msi_desc pci; }; /** -- cgit v1.2.3 From 173ffad79d177d9a91fbf3be6bf67ca81e0f765a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:18 +0100 Subject: PCI/MSI: Use msi_desc::msi_index The usage of msi_desc::pci::entry_nr is confusing at best. It's the index into the MSI[X] descriptor table. Use msi_desc::msi_index which is shared between all MSI incarnations instead of having a PCI specific storage for no value. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211210221814.602911509@linutronix.de --- include/linux/msi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 45ec5d07a5f3..b3d3b0bf59fe 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -80,7 +80,6 @@ typedef void (*irq_write_msi_msg_t)(struct msi_desc *desc, * @multi_cap: [PCI MSI/X] log2 num of messages supported * @can_mask: [PCI MSI/X] Masking supported? * @is_64: [PCI MSI/X] Address size: 0=32bit 1=64bit - * @entry_nr: [PCI MSI/X] Entry which is described by this descriptor * @default_irq:[PCI MSI/X] The default pre-assigned non-MSI irq * @mask_pos: [PCI MSI] Mask register position * @mask_base: [PCI MSI-X] Mask register base address @@ -97,7 +96,6 @@ struct pci_msi_desc { u8 can_mask : 1; u8 is_64 : 1; u8 is_virtual : 1; - u16 entry_nr; unsigned default_irq; } msi_attrib; union { -- cgit v1.2.3 From 7a823443e9b4ed1ff4a3026d184f09d23fd6d9c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:20 +0100 Subject: PCI/MSI: Provide MSI_FLAG_MSIX_CONTIGUOUS Provide a domain info flag which makes the core code check for a contiguous MSI-X index on allocation. That's simpler than checking it at some other domain callback in architecture code. 
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211210221814.662401116@linutronix.de --- include/linux/msi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index b3d3b0bf59fe..d206239e6fa8 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -362,6 +362,8 @@ enum { MSI_FLAG_LEVEL_CAPABLE = (1 << 6), /* Populate sysfs on alloc() and destroy it on free() */ MSI_FLAG_DEV_SYSFS = (1 << 7), + /* MSI-X entries must be contiguous */ + MSI_FLAG_MSIX_CONTIGUOUS = (1 << 8), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, -- cgit v1.2.3 From cf15f43acaad31dabb2646cef170a506a1d663eb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:23 +0100 Subject: genirq/msi: Provide interface to retrieve Linux interrupt number This allows drivers to retrieve the Linux interrupt number instead of fiddling with MSI descriptors. msi_get_virq() returns the Linux interrupt number or 0 in case that there is no entry for the given MSI index. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221814.780824745@linutronix.de --- include/linux/msi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index d206239e6fa8..7593fc383dba 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -153,6 +153,8 @@ struct msi_device_data { int msi_setup_device_data(struct device *dev); +unsigned int msi_get_virq(struct device *dev, unsigned int index); + /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) #define dev_to_msi_list(dev) (&(dev)->msi_list) -- cgit v1.2.3 From d86a6d47bcc6b41fe2a4e13313d66a772d00382f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:34 +0100 Subject: bus: fsl-mc: fsl-mc-allocator: Rework MSI handling Storing a pointer to the MSI descriptor just to track the Linux interrupt number is daft. Just store the interrupt number and be done with it. 
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221815.207838579@linutronix.de --- include/linux/fsl/mc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index e026f6c48b49..7b6c42bfb660 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -91,13 +91,13 @@ struct fsl_mc_resource { /** * struct fsl_mc_device_irq - MC object device message-based interrupt - * @msi_desc: pointer to MSI descriptor allocated by fsl_mc_msi_alloc_descs() + * @virq: Linux virtual interrupt number * @mc_dev: MC object device that owns this interrupt * @dev_irq_index: device-relative IRQ index * @resource: MC generic resource associated with the interrupt */ struct fsl_mc_device_irq { - struct msi_desc *msi_desc; + unsigned int virq; struct fsl_mc_device *mc_dev; u8 dev_irq_index; struct fsl_mc_resource resource; -- cgit v1.2.3 From 89e0032ec201f76c86d6e3e6f94574dfb8e39b71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:35 +0100 Subject: soc: ti: ti_sci_inta_msi: Get rid of ti_sci_inta_msi_get_virq() Just use the core function msi_get_virq(). 
Signed-off-by: Thomas Gleixner Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Acked-by: Arnd Bergmann Acked-by: Vinod Koul Acked-by: Nishanth Menon Link: https://lore.kernel.org/r/20211210221815.269468319@linutronix.de --- include/linux/soc/ti/ti_sci_inta_msi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/ti/ti_sci_inta_msi.h b/include/linux/soc/ti/ti_sci_inta_msi.h index e3aa8b14612e..25ea78a8ea5c 100644 --- a/include/linux/soc/ti/ti_sci_inta_msi.h +++ b/include/linux/soc/ti/ti_sci_inta_msi.h @@ -18,6 +18,5 @@ struct irq_domain struct irq_domain *parent); int ti_sci_inta_msi_domain_alloc_irqs(struct device *dev, struct ti_sci_resource *res); -unsigned int ti_sci_inta_msi_get_virq(struct device *dev, u32 index); void ti_sci_inta_msi_domain_free_irqs(struct device *dev); #endif /* __INCLUDE_LINUX_IRQCHIP_TI_SCI_INTA_H */ -- cgit v1.2.3 From 125282cd4f33ecd53a24ae4807409da0e5e90fd4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:04 +0100 Subject: genirq/msi: Move descriptor list to struct msi_device_data It's only required when MSI is in use. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.650487479@linutronix.de --- include/linux/device.h | 4 ---- include/linux/msi.h | 4 +++- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index f0033cd93631..93459724dcde 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -423,7 +423,6 @@ struct dev_msi_info { * @pins: For device pin management. * See Documentation/driver-api/pin-control.rst for details. * @msi: MSI related data - * @msi_list: Hosts MSI descriptors * @numa_node: NUMA node this device is close to. * @dma_ops: DMA mapping operations for this device. 
* @dma_mask: Dma mask (if dma'ble device). @@ -519,9 +518,6 @@ struct device { struct dev_pin_info *pins; #endif struct dev_msi_info msi; -#ifdef CONFIG_GENERIC_MSI_IRQ - struct list_head msi_list; -#endif #ifdef CONFIG_DMA_OPS const struct dma_map_ops *dma_ops; #endif diff --git a/include/linux/msi.h b/include/linux/msi.h index 7593fc383dba..4223e47103ed 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -144,11 +144,13 @@ struct msi_desc { * @properties: MSI properties which are interesting to drivers * @attrs: Pointer to the sysfs attribute group * @platform_data: Platform-MSI specific data + * @list: List of MSI descriptors associated to the device */ struct msi_device_data { unsigned long properties; const struct attribute_group **attrs; struct platform_msi_priv_data *platform_data; + struct list_head list; }; int msi_setup_device_data(struct device *dev); @@ -157,7 +159,7 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index); /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) -#define dev_to_msi_list(dev) (&(dev)->msi_list) +#define dev_to_msi_list(dev) (&(dev)->msi.data->list) #define first_msi_entry(dev) \ list_first_entry(dev_to_msi_list((dev)), struct msi_desc, list) #define for_each_msi_entry(desc, dev) \ -- cgit v1.2.3 From b5f687f97d1e112493fe0447a1fb09fbd93c334b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:05 +0100 Subject: genirq/msi: Add mutex for MSI list protection For upcoming runtime extensions of MSI-X interrupts it's required to protect the MSI descriptor list. Add a mutex to struct msi_device_data and provide lock/unlock functions. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.708877269@linutronix.de --- include/linux/msi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 4223e47103ed..2cf6c530588d 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -3,6 +3,7 @@ #define LINUX_MSI_H #include +#include #include #include @@ -145,17 +146,21 @@ struct msi_desc { * @attrs: Pointer to the sysfs attribute group * @platform_data: Platform-MSI specific data * @list: List of MSI descriptors associated to the device + * @mutex: Mutex protecting the MSI list */ struct msi_device_data { unsigned long properties; const struct attribute_group **attrs; struct platform_msi_priv_data *platform_data; struct list_head list; + struct mutex mutex; }; int msi_setup_device_data(struct device *dev); unsigned int msi_get_virq(struct device *dev, unsigned int index); +void msi_lock_descs(struct device *dev); +void msi_unlock_descs(struct device *dev); /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) -- cgit v1.2.3 From 0f62d941acf9ac3b6025692ce649b1f282b89e7f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:07 +0100 Subject: genirq/msi: Provide msi_domain_alloc/free_irqs_descs_locked() Usage sites which do allocations of the MSI descriptors before invoking msi_domain_alloc_irqs() require to lock the MSI descriptors across the operation. Provide entry points which can be called with the MSI mutex held and lock the mutex in the existing entry points. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.765371053@linutronix.de --- include/linux/msi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 2cf6c530588d..69c588efe85b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -383,9 +383,12 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct irq_domain *parent); int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec); +int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev, + int nvec); int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec); void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); +void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev); void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); -- cgit v1.2.3 From 1046f71d7268b1680d7b044dea83c664403f6302 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:08 +0100 Subject: genirq/msi: Provide a set of advanced MSI accessors and iterators In preparation for dynamic handling of MSI-X interrupts provide a new set of MSI descriptor accessor functions and iterators. They are beneficial per se as they allow to cleanup quite some code in various MSI domain implementations. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.818635078@linutronix.de --- include/linux/msi.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 69c588efe85b..703221f7e9ea 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -140,6 +140,18 @@ struct msi_desc { struct pci_msi_desc pci; }; +/* + * Filter values for the MSI descriptor iterators and accessor functions. + */ +enum msi_desc_filter { + /* All descriptors */ + MSI_DESC_ALL, + /* Descriptors which have no interrupt associated */ + MSI_DESC_NOTASSOCIATED, + /* Descriptors which have an interrupt associated */ + MSI_DESC_ASSOCIATED, +}; + /** * msi_device_data - MSI per device data * @properties: MSI properties which are interesting to drivers @@ -147,6 +159,7 @@ struct msi_desc { * @platform_data: Platform-MSI specific data * @list: List of MSI descriptors associated to the device * @mutex: Mutex protecting the MSI list + * @__next: Cached pointer to the next entry for iterators */ struct msi_device_data { unsigned long properties; @@ -154,6 +167,7 @@ struct msi_device_data { struct platform_msi_priv_data *platform_data; struct list_head list; struct mutex mutex; + struct msi_desc *__next; }; int msi_setup_device_data(struct device *dev); @@ -162,6 +176,25 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index); void msi_lock_descs(struct device *dev); void msi_unlock_descs(struct device *dev); +struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter); +struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter); + +/** + * msi_for_each_desc - Iterate the MSI descriptors + * + * @desc: struct msi_desc pointer used as iterator + * @dev: struct device pointer - device to iterate + * @filter: Filter for descriptor selection + 
* + * Notes: + * - The loop must be protected with a msi_lock_descs()/msi_unlock_descs() + * pair. + * - It is safe to remove a retrieved MSI descriptor in the loop. + */ +#define msi_for_each_desc(desc, dev, filter) \ + for ((desc) = msi_first_desc((dev), (filter)); (desc); \ + (desc) = msi_next_desc((dev), (filter))) + /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) #define dev_to_msi_list(dev) (&(dev)->msi.data->list) -- cgit v1.2.3 From 602905253607ba892336f7bba8bb45b5be819d87 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:10 +0100 Subject: genirq/msi: Provide msi_alloc_msi_desc() and a simple allocator Provide msi_alloc_msi_desc() which takes a template MSI descriptor for initializing a newly allocated descriptor. This allows to simplify various usage sites of alloc_msi_entry() and moves the storage handling into the core code. For simple cases where only a linear vector space is required provide msi_add_simple_msi_descs() which just allocates a linear range of MSI descriptors and fills msi_desc::msi_index accordingly. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.873833567@linutronix.de --- include/linux/msi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 703221f7e9ea..bbb8c1e2c18b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -247,6 +247,8 @@ static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) } #endif /* CONFIG_PCI_MSI */ +int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc); + struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, const struct irq_affinity_desc *affinity); void free_msi_entry(struct msi_desc *entry); -- cgit v1.2.3 From 645474e2cee450131e8b8d8a69a5d9bbabd43f3f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:12 +0100 Subject: genirq/msi: Provide domain flags to allocate/free MSI descriptors automatically Provide domain info flags which tell the core to allocate simple descriptors or to free descriptors when the interrupts are freed and implement the required functionality. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.928198636@linutronix.de --- include/linux/msi.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index bbb8c1e2c18b..17e47ab8d57a 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -105,6 +105,8 @@ struct pci_msi_desc { }; }; +#define MSI_MAX_INDEX ((unsigned int)USHRT_MAX) + /** * struct msi_desc - Descriptor structure for MSI based interrupts * @list: List head for management @@ -248,6 +250,17 @@ static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) #endif /* CONFIG_PCI_MSI */ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc); +void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, + unsigned int first_index, unsigned int last_index); + +/** + * msi_free_msi_descs - Free MSI descriptors of a device + * @dev: Device to free the descriptors + */ +static inline void msi_free_msi_descs(struct device *dev) +{ + msi_free_msi_descs_range(dev, MSI_DESC_ALL, 0, MSI_MAX_INDEX); +} struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, const struct irq_affinity_desc *affinity); @@ -408,6 +421,10 @@ enum { MSI_FLAG_DEV_SYSFS = (1 << 7), /* MSI-X entries must be contiguous */ MSI_FLAG_MSIX_CONTIGUOUS = (1 << 8), + /* Allocate simple MSI descriptors */ + MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS = (1 << 9), + /* Free MSI descriptors */ + MSI_FLAG_FREE_MSI_DESCS = (1 << 10), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, -- cgit v1.2.3 From 7ad321a5eadb52b4af1c577dda51783e08235ea7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:37 +0100 Subject: soc: ti: ti_sci_inta_msi: Remove ti_sci_inta_msi_domain_free_irqs() The function has no users and is pointless now that the core frees the MSI descriptors, 
which means potential users can just use msi_domain_free_irqs(). Signed-off-by: Thomas Gleixner Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210748.793119155@linutronix.de --- include/linux/soc/ti/ti_sci_inta_msi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/ti/ti_sci_inta_msi.h b/include/linux/soc/ti/ti_sci_inta_msi.h index 25ea78a8ea5c..4dba2f2aff6f 100644 --- a/include/linux/soc/ti/ti_sci_inta_msi.h +++ b/include/linux/soc/ti/ti_sci_inta_msi.h @@ -18,5 +18,4 @@ struct irq_domain struct irq_domain *parent); int ti_sci_inta_msi_domain_alloc_irqs(struct device *dev, struct ti_sci_resource *res); -void ti_sci_inta_msi_domain_free_irqs(struct device *dev); #endif /* __INCLUDE_LINUX_IRQCHIP_TI_SCI_INTA_H */ -- cgit v1.2.3 From ef8dd01538ea2553ab101ddce6a85a321406d9c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:44 +0100 Subject: genirq/msi: Make interrupt allocation less convoluted There is no real reason to do several loops over the MSI descriptors instead of just doing one loop. In case of an error everything is undone anyway so it does not matter whether it's a partial or a full rollback. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210749.010234767@linutronix.de --- include/linux/msi.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 17e47ab8d57a..e8dd0be17e89 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -206,12 +206,6 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter); list_for_each_entry((desc), dev_to_msi_list((dev)), list) #define for_each_msi_entry_safe(desc, tmp, dev) \ list_for_each_entry_safe((desc), (tmp), dev_to_msi_list((dev)), list) -#define for_each_msi_vector(desc, __irq, dev) \ - for_each_msi_entry((desc), (dev)) \ - if ((desc)->irq) \ - for (__irq = (desc)->irq; \ - __irq < ((desc)->irq + (desc)->nvec_used); \ - __irq++) #ifdef CONFIG_IRQ_MSI_IOMMU static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) -- cgit v1.2.3 From cc9a246dbf6bdef56d9eee296a1db52dd0607976 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:47 +0100 Subject: genirq/msi: Mop up old interfaces Get rid of the old iterators, alloc/free functions and adjust the core code accordingly. 
Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210749.117395027@linutronix.de --- include/linux/msi.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index e8dd0be17e89..b54010ba7b0d 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -197,15 +197,7 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter); for ((desc) = msi_first_desc((dev), (filter)); (desc); \ (desc) = msi_next_desc((dev), (filter))) -/* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) -#define dev_to_msi_list(dev) (&(dev)->msi.data->list) -#define first_msi_entry(dev) \ - list_first_entry(dev_to_msi_list((dev)), struct msi_desc, list) -#define for_each_msi_entry(desc, dev) \ - list_for_each_entry((desc), dev_to_msi_list((dev)), list) -#define for_each_msi_entry_safe(desc, tmp, dev) \ - list_for_each_entry_safe((desc), (tmp), dev_to_msi_list((dev)), list) #ifdef CONFIG_IRQ_MSI_IOMMU static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) @@ -231,10 +223,6 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, #endif #ifdef CONFIG_PCI_MSI -#define first_pci_msi_entry(pdev) first_msi_entry(&(pdev)->dev) -#define for_each_pci_msi_entry(desc, pdev) \ - for_each_msi_entry((desc), &(pdev)->dev) - struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc); void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg); #else /* CONFIG_PCI_MSI */ @@ -256,9 +244,6 @@ static inline void msi_free_msi_descs(struct device *dev) msi_free_msi_descs_range(dev, MSI_DESC_ALL, 0, MSI_MAX_INDEX); } -struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, - const struct irq_affinity_desc *affinity); -void free_msi_entry(struct msi_desc *entry); void __pci_read_msi_msg(struct msi_desc *entry, struct 
msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); -- cgit v1.2.3 From ef3350c53d2aac65cf1c4ecc968bbb1de5f421ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:49 +0100 Subject: genirq/msi: Add abuse prevention comment to msi header Hope dies last. Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210749.170847844@linutronix.de --- include/linux/msi.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index b54010ba7b0d..70cc6a555a8e 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -2,6 +2,20 @@ #ifndef LINUX_MSI_H #define LINUX_MSI_H +/* + * This header file contains MSI data structures and functions which are + * only relevant for: + * - Interrupt core code + * - PCI/MSI core code + * - MSI interrupt domain implementations + * - IOMMU, low level VFIO, NTB and other justified exceptions + * dealing with low level MSI details. + * + * Regular device drivers have no business with any of these functions and + * especially storing MSI descriptor pointers in random code is considered + * abuse. The only function which is relevant for drivers is msi_get_virq(). + */ + #include #include #include -- cgit v1.2.3 From bf5e758f02fc739589dcc6a3395c3a3eb77b5c90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:50 +0100 Subject: genirq/msi: Simplify sysfs handling The sysfs handling for MSI is a convoluted maze and it is in the way of supporting dynamic expansion of the MSI-X vectors because it only supports a one off bulk population/free of the sysfs entries. Change it to do: 1) Creating an empty sysfs attribute group when msi_device_data is allocated 2) Populate the entries when the MSI descriptor is initialized 3) Free the entries when a MSI descriptor is detached from a Linux interrupt. 
4) Provide functions for the legacy non-irqdomain fallback code to do a bulk population/free. This code won't support dynamic expansion. This makes the code simpler and reduces the number of allocations as the empty attribute group can be shared. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210749.224917330@linutronix.de --- include/linux/msi.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 70cc6a555a8e..1a00367d2cfa 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -71,7 +71,7 @@ struct irq_data; struct msi_desc; struct pci_dev; struct platform_msi_priv_data; -struct attribute_group; +struct device_attribute; void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ @@ -129,6 +129,7 @@ struct pci_msi_desc { * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor + * @sysfs_attr: Pointer to sysfs device attribute * * @write_msi_msg: Callback that may be called when the MSI message * address or data changes @@ -148,6 +149,9 @@ struct msi_desc { #ifdef CONFIG_IRQ_MSI_IOMMU const void *iommu_cookie; #endif +#ifdef CONFIG_SYSFS + struct device_attribute *sysfs_attrs; +#endif void (*write_msi_msg)(struct msi_desc *entry, void *data); void *write_msi_msg_data; @@ -171,7 +175,6 @@ enum msi_desc_filter { /** * msi_device_data - MSI per device data * @properties: MSI properties which are interesting to drivers - * @attrs: Pointer to the sysfs attribute group * @platform_data: Platform-MSI specific data * @list: List of MSI descriptors associated to the device * @mutex: Mutex protecting the MSI list @@ -179,7 +182,6 @@ enum msi_desc_filter { 
*/ struct msi_device_data { unsigned long properties; - const struct attribute_group **attrs; struct platform_msi_priv_data *platform_data; struct list_head list; struct mutex mutex; @@ -264,14 +266,6 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); -#ifdef CONFIG_SYSFS -int msi_device_populate_sysfs(struct device *dev); -void msi_device_destroy_sysfs(struct device *dev); -#else /* CONFIG_SYSFS */ -static inline int msi_device_populate_sysfs(struct device *dev) { return 0; } -static inline void msi_device_destroy_sysfs(struct device *dev) { } -#endif /* !CONFIG_SYSFS */ - /* * The arch hooks to setup up msi irqs. Default functions are implemented * as weak symbols so that they /can/ be overriden by architecture specific @@ -285,6 +279,13 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); +#ifdef CONFIG_SYSFS +int msi_device_populate_sysfs(struct device *dev); +void msi_device_destroy_sysfs(struct device *dev); +#else /* CONFIG_SYSFS */ +static inline int msi_device_populate_sysfs(struct device *dev) { return 0; } +static inline void msi_device_destroy_sysfs(struct device *dev) { } +#endif /* !CONFIG_SYSFS */ #endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ /* -- cgit v1.2.3 From cd6cf06590b9792340dceaa285138777f3cc4d90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:52 +0100 Subject: genirq/msi: Convert storage to xarray The current linked list storage for MSI descriptors is suboptimal in several ways: 1) Looking up a MSI descriptor requires a O(n) list walk in the worst case 2) The upcoming support of runtime expansion of MSI-X vectors would need to do a full list walk to figure out whether a particular index is already associated. 
3) Runtime expansion of sparse allocations is even more complex as the current implementation assumes an ordered list (increasing MSI index). Use an xarray which solves all of the above problems nicely. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210749.280627070@linutronix.de --- include/linux/msi.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 1a00367d2cfa..fc918a658d48 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -17,6 +17,7 @@ */ #include +#include #include #include #include @@ -123,7 +124,6 @@ struct pci_msi_desc { /** * struct msi_desc - Descriptor structure for MSI based interrupts - * @list: List head for management * @irq: The base interrupt number * @nvec_used: The number of vectors used * @dev: Pointer to the device which uses this descriptor @@ -140,7 +140,6 @@ struct pci_msi_desc { */ struct msi_desc { /* Shared device/bus type independent data */ - struct list_head list; unsigned int irq; unsigned int nvec_used; struct device *dev; @@ -176,16 +175,16 @@ enum msi_desc_filter { * msi_device_data - MSI per device data * @properties: MSI properties which are interesting to drivers * @platform_data: Platform-MSI specific data - * @list: List of MSI descriptors associated to the device - * @mutex: Mutex protecting the MSI list - * @__next: Cached pointer to the next entry for iterators + * @mutex: Mutex protecting the MSI descriptor store + * @__store: Xarray for storing MSI descriptor pointers + * @__iter_idx: Index to search the next entry for iterators */ struct msi_device_data { unsigned long properties; struct platform_msi_priv_data *platform_data; - struct list_head list; struct mutex mutex; - struct msi_desc *__next; + struct xarray __store; + unsigned long __iter_idx; }; int 
msi_setup_device_data(struct device *dev); -- cgit v1.2.3 From 60f20d84dc813f1342771a3e4f06d89da26dc412 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 18 Nov 2021 12:12:10 -0600 Subject: of/fdt: Rework early_init_dt_scan_chosen() to call directly Use of the of_scan_flat_dt() function predates libfdt and is discouraged as libfdt provides a nicer set of APIs. Rework early_init_dt_scan_chosen() to be called directly and use libfdt. Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Frank Rowand Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Rob Herring Reviewed-by: Frank Rowand Link: https://lore.kernel.org/r/20211118181213.1433346-2-robh@kernel.org --- include/linux/of_fdt.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index cf48983d3c86..654722235df6 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -58,8 +58,7 @@ extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); extern uint32_t of_get_flat_dt_phandle(unsigned long node); -extern int early_init_dt_scan_chosen(unsigned long node, const char *uname, - int depth, void *data); +extern int early_init_dt_scan_chosen(char *cmdline); extern int early_init_dt_scan_memory(unsigned long node, const char *uname, int depth, void *data); extern int early_init_dt_scan_chosen_stdout(void); -- cgit v1.2.3 From d665881d2171b62ca1ea23be89be6f2a8a330bb2 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 18 Nov 2021 12:12:11 -0600 Subject: of/fdt: Rework early_init_dt_scan_root() to call directly Use of the of_scan_flat_dt() function predates libfdt and is discouraged as libfdt provides a nicer set of APIs. Rework early_init_dt_scan_root() to be called directly and use libfdt. 
Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Frank Rowand Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Rob Herring Reviewed-by: Frank Rowand Link: https://lore.kernel.org/r/20211118181213.1433346-3-robh@kernel.org --- include/linux/of_fdt.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 654722235df6..df3d31926c3c 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -68,8 +68,7 @@ extern void early_init_dt_add_memory_arch(u64 base, u64 size); extern u64 dt_mem_next_cell(int s, const __be32 **cellp); /* Early flat tree scan hooks */ -extern int early_init_dt_scan_root(unsigned long node, const char *uname, - int depth, void *data); +extern int early_init_dt_scan_root(void); extern bool early_init_dt_scan(void *params); extern bool early_init_dt_verify(void *params); -- cgit v1.2.3 From 1f012283e9360fb4007308f04cfaeb205e34b684 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 15 Dec 2021 09:01:02 -0600 Subject: of/fdt: Rework early_init_dt_scan_memory() to call directly Use of the of_scan_flat_dt() function predates libfdt and is discouraged as libfdt provides a nicer set of APIs. Rework early_init_dt_scan_memory() to be called directly and use libfdt. 
Cc: John Crispin Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Frank Rowand Cc: linux-mips@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Reviewed-by: Frank Rowand Signed-off-by: Rob Herring Tested-by: Michael Ellerman Link: https://lore.kernel.org/r/20211215150102.1303588-1-robh@kernel.org --- include/linux/of_fdt.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index df3d31926c3c..914739f3c192 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -59,8 +59,7 @@ extern unsigned long of_get_flat_dt_root(void); extern uint32_t of_get_flat_dt_phandle(unsigned long node); extern int early_init_dt_scan_chosen(char *cmdline); -extern int early_init_dt_scan_memory(unsigned long node, const char *uname, - int depth, void *data); +extern int early_init_dt_scan_memory(void); extern int early_init_dt_scan_chosen_stdout(void); extern void early_init_fdt_scan_reserved_mem(void); extern void early_init_fdt_reserve_self(void); -- cgit v1.2.3 From f7ea534a0920dbaf71a8003936e178e14ec9271d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 15 Dec 2021 18:55:36 -0800 Subject: add includes masked by cgroup -> bpf dependency cgroup pulls in BPF which pulls in a lot of includes. We're about to break that chain so fix those who were depending on it. 
Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211216025538.1649516-2-kuba@kernel.org --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0dcfd265beed..4a021149eaf0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -611,6 +611,7 @@ struct swevent_hlist { #define PERF_ATTACH_SCHED_CB 0x20 #define PERF_ATTACH_CHILD 0x40 +struct bpf_prog; struct perf_cgroup; struct perf_buffer; -- cgit v1.2.3 From fd1740b6abac39f68ce12e201697f106e0f1d519 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 15 Dec 2021 18:55:38 -0800 Subject: bpf: Remove the cgroup -> bpf header dependency Remove the dependency from cgroup-defs.h to bpf-cgroup.h and bpf.h. This reduces the incremental build size of x86 allmodconfig after bpf.h was touched from ~17k objects rebuilt to ~5k objects. bpf.h is 2.2kLoC and is modified relatively often. We need a new header with just the definition of struct cgroup_bpf and enum cgroup_bpf_attach_type, this is akin to cgroup-defs.h. 
Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20211216025538.1649516-4-kuba@kernel.org --- include/linux/bpf-cgroup-defs.h | 70 +++++++++++++++++++++++++++++++++++++++++ include/linux/bpf-cgroup.h | 57 +-------------------------------- include/linux/cgroup-defs.h | 2 +- 3 files changed, 72 insertions(+), 57 deletions(-) create mode 100644 include/linux/bpf-cgroup-defs.h (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h new file mode 100644 index 000000000000..695d1224a71b --- /dev/null +++ b/include/linux/bpf-cgroup-defs.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_CGROUP_DEFS_H +#define _BPF_CGROUP_DEFS_H + +#ifdef CONFIG_CGROUP_BPF + +#include +#include +#include + +struct bpf_prog_array; + +enum cgroup_bpf_attach_type { + CGROUP_BPF_ATTACH_TYPE_INVALID = -1, + CGROUP_INET_INGRESS = 0, + CGROUP_INET_EGRESS, + CGROUP_INET_SOCK_CREATE, + CGROUP_SOCK_OPS, + CGROUP_DEVICE, + CGROUP_INET4_BIND, + CGROUP_INET6_BIND, + CGROUP_INET4_CONNECT, + CGROUP_INET6_CONNECT, + CGROUP_INET4_POST_BIND, + CGROUP_INET6_POST_BIND, + CGROUP_UDP4_SENDMSG, + CGROUP_UDP6_SENDMSG, + CGROUP_SYSCTL, + CGROUP_UDP4_RECVMSG, + CGROUP_UDP6_RECVMSG, + CGROUP_GETSOCKOPT, + CGROUP_SETSOCKOPT, + CGROUP_INET4_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, + CGROUP_INET4_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, + CGROUP_INET_SOCK_RELEASE, + MAX_CGROUP_BPF_ATTACH_TYPE +}; + +struct cgroup_bpf { + /* array of effective progs in this cgroup */ + struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE]; + + /* attached progs to this cgroup and attach flags + * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will + * have either zero or one element + * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS + */ + struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; + u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; + + /* list of cgroup 
shared storages */ + struct list_head storages; + + /* temp storage for effective prog array used by prog_attach/detach */ + struct bpf_prog_array *inactive; + + /* reference counter used to detach bpf programs after cgroup removal */ + struct percpu_ref refcnt; + + /* cgroup_bpf is released using a work queue */ + struct work_struct release_work; +}; + +#else /* CONFIG_CGROUP_BPF */ +struct cgroup_bpf {}; +#endif /* CONFIG_CGROUP_BPF */ + +#endif diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 11820a430d6c..b525d8cdc25b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -3,10 +3,10 @@ #define _BPF_CGROUP_H #include +#include #include #include #include -#include #include #include @@ -23,33 +23,6 @@ struct ctl_table_header; struct task_struct; #ifdef CONFIG_CGROUP_BPF -enum cgroup_bpf_attach_type { - CGROUP_BPF_ATTACH_TYPE_INVALID = -1, - CGROUP_INET_INGRESS = 0, - CGROUP_INET_EGRESS, - CGROUP_INET_SOCK_CREATE, - CGROUP_SOCK_OPS, - CGROUP_DEVICE, - CGROUP_INET4_BIND, - CGROUP_INET6_BIND, - CGROUP_INET4_CONNECT, - CGROUP_INET6_CONNECT, - CGROUP_INET4_POST_BIND, - CGROUP_INET6_POST_BIND, - CGROUP_UDP4_SENDMSG, - CGROUP_UDP6_SENDMSG, - CGROUP_SYSCTL, - CGROUP_UDP4_RECVMSG, - CGROUP_UDP6_RECVMSG, - CGROUP_GETSOCKOPT, - CGROUP_SETSOCKOPT, - CGROUP_INET4_GETPEERNAME, - CGROUP_INET6_GETPEERNAME, - CGROUP_INET4_GETSOCKNAME, - CGROUP_INET6_GETSOCKNAME, - CGROUP_INET_SOCK_RELEASE, - MAX_CGROUP_BPF_ATTACH_TYPE -}; #define CGROUP_ATYPE(type) \ case BPF_##type: return type @@ -127,33 +100,6 @@ struct bpf_prog_list { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; }; -struct bpf_prog_array; - -struct cgroup_bpf { - /* array of effective progs in this cgroup */ - struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE]; - - /* attached progs to this cgroup and attach flags - * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will - * have either zero or one element - * when BPF_F_ALLOW_MULTI the 
list can have up to BPF_CGROUP_MAX_PROGS - */ - struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; - u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; - - /* list of cgroup shared storages */ - struct list_head storages; - - /* temp storage for effective prog array used by prog_attach/detach */ - struct bpf_prog_array *inactive; - - /* reference counter used to detach bpf programs after cgroup removal */ - struct percpu_ref refcnt; - - /* cgroup_bpf is released using a work queue */ - struct work_struct release_work; -}; - int cgroup_bpf_inherit(struct cgroup *cgrp); void cgroup_bpf_offline(struct cgroup *cgrp); @@ -451,7 +397,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else -struct cgroup_bpf {}; static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index db2e147e069f..411684c80cf3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_CGROUPS -- cgit v1.2.3 From 0f55f9ed21f96630c6ec96805d42f92c0b458b37 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Thu, 16 Dec 2021 13:33:56 -0800 Subject: bpf: Only print scratched registers and stack slots to verifier logs. When printing verifier state for any log level, print full verifier state only on function calls or on errors. Otherwise, only print the registers and stack slots that were accessed. 
Log size differences: verif_scale_loop6 before: 234566564 verif_scale_loop6 after: 72143943 69% size reduction kfree_skb before: 166406 kfree_skb after: 55386 69% size reduction Before: 156: (61) r0 = *(u32 *)(r1 +0) 157: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=ctx(id=0,off=0,imm=0) R2_w=invP0 R10=fp0 fp-8_w=00000000 fp-16_w=00\ 000000 fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 fp-56_w=00000000 fp-64_w=00000000 fp-72_w=00000000 fp-80_w=00000\ 000 fp-88_w=00000000 fp-96_w=00000000 fp-104_w=00000000 fp-112_w=00000000 fp-120_w=00000000 fp-128_w=00000000 fp-136_w=00000000 fp-144_w=00\ 000000 fp-152_w=00000000 fp-160_w=00000000 fp-168_w=00000000 fp-176_w=00000000 fp-184_w=00000000 fp-192_w=00000000 fp-200_w=00000000 fp-208\ _w=00000000 fp-216_w=00000000 fp-224_w=00000000 fp-232_w=00000000 fp-240_w=00000000 fp-248_w=00000000 fp-256_w=00000000 fp-264_w=00000000 f\ p-272_w=00000000 fp-280_w=00000000 fp-288_w=00000000 fp-296_w=00000000 fp-304_w=00000000 fp-312_w=00000000 fp-320_w=00000000 fp-328_w=00000\ 000 fp-336_w=00000000 fp-344_w=00000000 fp-352_w=00000000 fp-360_w=00000000 fp-368_w=00000000 fp-376_w=00000000 fp-384_w=00000000 fp-392_w=\ 00000000 fp-400_w=00000000 fp-408_w=00000000 fp-416_w=00000000 fp-424_w=00000000 fp-432_w=00000000 fp-440_w=00000000 fp-448_w=00000000 ; return skb->len; 157: (95) exit Func#4 is safe for any args that match its prototype Validating get_constant() func#5... 158: R1=invP(id=0) R10=fp0 ; int get_constant(long val) 158: (bf) r0 = r1 159: R0_w=invP(id=1) R1=invP(id=1) R10=fp0 ; return val - 122; 159: (04) w0 += -122 160: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=invP(id=1) R10=fp0 ; return val - 122; 160: (95) exit Func#5 is safe for any args that match its prototype Validating get_skb_ifindex() func#6... 
161: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0 ; int get_skb_ifindex(int val, struct __sk_buff *skb, int var) 161: (bc) w0 = w3 162: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0 After: 156: (61) r0 = *(u32 *)(r1 +0) 157: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=ctx(id=0,off=0,imm=0) ; return skb->len; 157: (95) exit Func#4 is safe for any args that match its prototype Validating get_constant() func#5... 158: R1=invP(id=0) R10=fp0 ; int get_constant(long val) 158: (bf) r0 = r1 159: R0_w=invP(id=1) R1=invP(id=1) ; return val - 122; 159: (04) w0 += -122 160: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return val - 122; 160: (95) exit Func#5 is safe for any args that match its prototype Validating get_skb_ifindex() func#6... 161: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0 ; int get_skb_ifindex(int val, struct __sk_buff *skb, int var) 161: (bc) w0 = w3 162: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R3=invP(id=0) Signed-off-by: Christy Lee Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211216213358.3374427-2-christylee@fb.com --- include/linux/bpf_verifier.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 182b16a91084..c66f238c538d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -474,6 +474,13 @@ struct bpf_verifier_env { /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; bpfptr_t fd_array; + + /* bit mask to keep track of whether a register has been accessed + * since the last time the function state was printed + */ + u32 scratched_regs; + /* Same as scratched_regs but for stack slots */ + u64 scratched_stack_slots; }; __printf(2, 0) void 
bpf_verifier_vlog(struct bpf_verifier_log *log, -- cgit v1.2.3 From 2e5766483c8c5cf886b4dc647a1741738dde7d79 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Thu, 16 Dec 2021 19:42:45 -0800 Subject: bpf: Right align verifier states in verifier logs. Make the verifier logs more readable, print the verifier states on the corresponding instruction line. If the previous line was not a bpf instruction, then print the verifier states on its own line. Before: Validating test_pkt_access_subprog3() func#3... 86: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R10=fp0 ; int test_pkt_access_subprog3(int val, struct __sk_buff *skb) 86: (bf) r6 = r2 87: R2=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 87: (bc) w7 = w1 88: R1=invP(id=0) R7_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 88: (bf) r1 = r6 89: R1_w=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 89: (85) call pc+9 Func#4 is global and valid. Skipping. 90: R0_w=invP(id=0) 90: (bc) w8 = w0 91: R0_w=invP(id=0) R8_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 91: (b7) r1 = 123 92: R1_w=invP123 92: (85) call pc+65 Func#5 is global and valid. Skipping. 93: R0=invP(id=0) After: 86: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R10=fp0 ; int test_pkt_access_subprog3(int val, struct __sk_buff *skb) 86: (bf) r6 = r2 ; R2=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 87: (bc) w7 = w1 ; R1=invP(id=0) R7_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 88: (bf) r1 = r6 ; R1_w=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 89: (85) call pc+9 Func#4 is global and valid. Skipping. 
90: R0_w=invP(id=0) 90: (bc) w8 = w0 ; R0_w=invP(id=0) R8_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 91: (b7) r1 = 123 ; R1_w=invP123 92: (85) call pc+65 Func#5 is global and valid. Skipping. 93: R0=invP(id=0) Signed-off-by: Christy Lee Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c66f238c538d..ee931398f311 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -388,6 +388,8 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) #define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) #define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ +#define BPF_LOG_MIN_ALIGNMENT 8U +#define BPF_LOG_ALIGNMENT 40U static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { @@ -481,6 +483,7 @@ struct bpf_verifier_env { u32 scratched_regs; /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; + u32 prev_log_len, prev_insn_print_len; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, -- cgit v1.2.3 From 03de6b273805b3c552ff158f8688555937375926 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Nov 2021 23:22:00 +0100 Subject: dmaengine: qcom-adm: stop abusing slave_id config The slave_id was previously used to pick one DMA slave instead of another, but this is now done through the DMA descriptors in device tree. 
For the qcom_adm driver, the configuration is documented in the DT binding to contain a tuple of device identifier and a "crci" field, but the implementation ends up using only a single cell for identifying the slave, with the crci getting passed in nonstandard properties of the device, and passed through the dma driver using the old slave_id field. Part of the problem apparently is that the nand driver ends up using only a single DMA request ID, but requires distinct values for "crci" depending on the type of transfer. Change both the dmaengine driver and the two slave drivers to allow the documented binding to work in addition to the ad-hoc passing of crci values. In order to no longer abuse the slave_id field, pass the data using the "peripheral_config" mechanism instead. Signed-off-by: Arnd Bergmann Acked-by: Mark Brown Link: https://lore.kernel.org/r/20211122222203.4103644-9-arnd@kernel.org Signed-off-by: Vinod Koul --- include/linux/dma/qcom_adm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 include/linux/dma/qcom_adm.h (limited to 'include/linux') diff --git a/include/linux/dma/qcom_adm.h b/include/linux/dma/qcom_adm.h new file mode 100644 index 000000000000..af20df674f0c --- /dev/null +++ b/include/linux/dma/qcom_adm.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __LINUX_DMA_QCOM_ADM_H +#define __LINUX_DMA_QCOM_ADM_H + +#include + +struct qcom_adm_peripheral_config { + u32 crci; + u32 mux; +}; + +#endif /* __LINUX_DMA_QCOM_ADM_H */ -- cgit v1.2.3 From 93cdb5b0dc56cc7a8b87a61146495f3bdc93d7ba Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Nov 2021 23:22:01 +0100 Subject: dmaengine: xilinx_dpdma: stop using slave_id field The display driver wants to pass a custom flag to the DMA engine driver, which it started doing by using the slave_id field that was traditionally used for a different purpose. 
As there is no longer a correct use for the slave_id field, it should really be removed, and the remaining users changed over to something different. The new mechanism for passing nonstandard settings is using the .peripheral_config field, so use that to pass a newly defined structure here, making it clear that this will not work in portable drivers. Reviewed-by: Laurent Pinchart Signed-off-by: Arnd Bergmann Acked-by: Mark Brown Link: https://lore.kernel.org/r/20211122222203.4103644-10-arnd@kernel.org Signed-off-by: Vinod Koul --- include/linux/dma/xilinx_dpdma.h | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 include/linux/dma/xilinx_dpdma.h (limited to 'include/linux') diff --git a/include/linux/dma/xilinx_dpdma.h b/include/linux/dma/xilinx_dpdma.h new file mode 100644 index 000000000000..83a1377f03f8 --- /dev/null +++ b/include/linux/dma/xilinx_dpdma.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __LINUX_DMA_XILINX_DPDMA_H +#define __LINUX_DMA_XILINX_DPDMA_H + +#include + +struct xilinx_dpdma_peripheral_config { + bool video_group; +}; + +#endif /* __LINUX_DMA_XILINX_DPDMA_H */ -- cgit v1.2.3 From 3c219644075795a99271d345efdfa8b256e55161 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Nov 2021 23:22:03 +0100 Subject: dmaengine: remove slave_id config field All references to the slave_id field have been removed, so remove the field as well to prevent new references from creeping in again. Originally this allowed slave DMA drivers to configure which device is accessed with the dmaengine_slave_config() call, but this was inconsistent, as the same information is also passed while requesting a channel, and never changes in practice. In modern kernels, the device is always selected when requesting the channel, so the .slave_id field is no longer useful. 
Reviewed-by: Laurent Pinchart Signed-off-by: Arnd Bergmann Acked-by: Mark Brown Link: https://lore.kernel.org/r/20211122222203.4103644-12-arnd@kernel.org Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 9000f3ffce8b..0349b35235e6 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -418,9 +418,6 @@ enum dma_slave_buswidth { * @device_fc: Flow Controller Settings. Only valid for slave channels. Fill * with 'true' if peripheral should be flow controller. Direction will be * selected at Runtime. - * @slave_id: Slave requester id. Only valid for slave channels. The dma - * slave peripheral will have unique id as dma requester which need to be - * pass as slave config. * @peripheral_config: peripheral configuration for programming peripheral * for dmaengine transfer * @peripheral_size: peripheral configuration buffer size @@ -448,7 +445,6 @@ struct dma_slave_config { u32 src_port_window_size; u32 dst_port_window_size; bool device_fc; - unsigned int slave_id; void *peripheral_config; size_t peripheral_size; }; -- cgit v1.2.3 From 3d725965f836a7acbd1674e33644bec18373de53 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 7 Dec 2021 15:33:06 -0800 Subject: crypto: ccp - Add SEV_INIT_EX support Add new module parameter to allow users to use SEV_INIT_EX instead of SEV_INIT. This helps users who lock their SPI bus to use the PSP for SEV functionality. The 'init_ex_path' parameter defaults to NULL which means the kernel will use SEV_INIT, if a path is specified SEV_INIT_EX will be used with the data found at the path. On certain PSP commands this file is written to as the PSP updates the NV memory region. Depending on file system initialization this file open may fail during module init but the CCP driver for SEV already has sufficient retries for platform initialization. 
During normal operation of PSP system and SEV commands, if the PSP has not been initialized, it is initialized at run time. If the file at 'init_ex_path' does not exist the PSP will not be initialized. The user must create the file prior to use with 32Kb of 0xFFs per spec. Signed-off-by: David Rientjes Co-developed-by: Peter Gonda Signed-off-by: Peter Gonda Reviewed-by: Marc Orr Reported-by: kernel test robot Acked-by: Brijesh Singh Cc: Tom Lendacky Cc: Brijesh Singh Cc: Marc Orr Cc: Joerg Roedel Cc: Herbert Xu Cc: David Rientjes Cc: John Allen Cc: "David S. Miller" Cc: Paolo Bonzini Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- include/linux/psp-sev.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index d48a7192e881..1595088c428b 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -52,6 +52,7 @@ enum sev_cmd { SEV_CMD_DF_FLUSH = 0x00A, SEV_CMD_DOWNLOAD_FIRMWARE = 0x00B, SEV_CMD_GET_ID = 0x00C, + SEV_CMD_INIT_EX = 0x00D, /* Guest commands */ SEV_CMD_DECOMMISSION = 0x020, @@ -102,6 +103,26 @@ struct sev_data_init { u32 tmr_len; /* In */ } __packed; +/** + * struct sev_data_init_ex - INIT_EX command parameters + * + * @length: len of the command buffer read by the PSP + * @flags: processing flags + * @tmr_address: system physical address used for SEV-ES + * @tmr_len: len of tmr_address + * @nv_address: system physical address used for PSP NV storage + * @nv_len: len of nv_address + */ +struct sev_data_init_ex { + u32 length; /* In */ + u32 flags; /* In */ + u64 tmr_address; /* In */ + u32 tmr_len; /* In */ + u32 reserved; /* In */ + u64 nv_address; /* In/Out */ + u32 nv_len; /* In */ +} __packed; + #define SEV_INIT_FLAGS_SEV_ES 0x01 /** -- cgit v1.2.3 From 9dfa5b6f5efb85efe69fd3b7b0b912004d9547f1 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 16 Dec 2021 09:17:03 +0800 Subject: iommu/vt-d: Remove unused
macros These macros have no references in the tree anymore. Clean them up. Signed-off-by: Lu Baolu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211216011703.763331-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-svm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 57cceecbe37f..1b73bab7eeff 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -8,12 +8,6 @@ #ifndef __INTEL_SVM_H__ #define __INTEL_SVM_H__ -/* Values for rxwp in fault_cb callback */ -#define SVM_REQ_READ (1<<3) -#define SVM_REQ_WRITE (1<<2) -#define SVM_REQ_EXEC (1<<1) -#define SVM_REQ_PRIV (1<<0) - /* Page Request Queue depth */ #define PRQ_ORDER 2 #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) -- cgit v1.2.3 From b62e3317b68d9c84301940ca8ca9c35a584111b2 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Thu, 16 Dec 2021 23:19:16 +0800 Subject: net: fix typo in a comment The double 'as' in a comment is repeated, thus it should be removed. Signed-off-by: Xiang wangx Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index be5cb3360b94..6aadcc0ecb5b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1937,7 +1937,7 @@ enum netdev_ml_priv_type { * @udp_tunnel_nic: UDP tunnel offload state * @xdp_state: stores info on attached XDP BPF programs * - * @nested_level: Used as as a parameter of spin_lock_nested() of + * @nested_level: Used as a parameter of spin_lock_nested() of * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted.
-- cgit v1.2.3 From dd61b29207ca4f346fbd9c06bc49f093e3369185 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 7 Dec 2021 10:34:06 +0100 Subject: gpiolib: provide gpiod_remove_hogs() Currently all users of gpiod_add_hogs() call it only once at system init so there never was any need for a mechanism allowing to remove them. Now the upcoming gpio-sim will need to tear down chips with hogged lines so provide a function that allows to remove hogs. Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij --- include/linux/gpio/machine.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h index d755e529c1e3..2647dd10b541 100644 --- a/include/linux/gpio/machine.h +++ b/include/linux/gpio/machine.h @@ -100,6 +100,7 @@ void gpiod_add_lookup_table(struct gpiod_lookup_table *table); void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n); void gpiod_remove_lookup_table(struct gpiod_lookup_table *table); void gpiod_add_hogs(struct gpiod_hog *hogs); +void gpiod_remove_hogs(struct gpiod_hog *hogs); #else /* ! 
CONFIG_GPIOLIB */ static inline void gpiod_add_lookup_table(struct gpiod_lookup_table *table) {} @@ -108,6 +109,7 @@ void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n) {} static inline void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) {} static inline void gpiod_add_hogs(struct gpiod_hog *hogs) {} +static inline void gpiod_remove_hogs(struct gpiod_hog *hogs) {} #endif /* CONFIG_GPIOLIB */ #endif /* __LINUX_GPIO_MACHINE_H */ -- cgit v1.2.3 From 990f6756bb64756d2d1033118cded6333b43397d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 13 Dec 2021 11:16:41 +0100 Subject: gpiolib: allow to specify the firmware node in struct gpio_chip Software nodes allow us to represent hierarchies for device components that don't have their struct device representation yet - for instance: banks of GPIOs under a common GPIO expander. The gpiolib core, however, doesn't offer any way of passing this information from the drivers. This extends struct gpio_chip with a pointer to fwnode that can be set by the driver and used to pass device properties for child nodes. This is similar to how we handle device-tree sub-nodes with CONFIG_OF_GPIO enabled. Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij --- include/linux/gpio/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index a673a359e20b..b0728c8ad90c 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -289,6 +289,7 @@ struct gpio_irq_chip { * number or the name of the SoC IP-block implementing it.
* @gpiodev: the internal state holder, opaque struct * @parent: optional parent device providing the GPIOs + * @fwnode: optional fwnode providing this controller's properties * @owner: helps prevent removal of modules exporting active GPIOs * @request: optional hook for chip-specific activation, such as * enabling module power and clock; may sleep @@ -377,6 +378,7 @@ struct gpio_chip { const char *label; struct gpio_device *gpiodev; struct device *parent; + struct fwnode_handle *fwnode; struct module *owner; int (*request)(struct gpio_chip *gc, -- cgit v1.2.3 From c06ef740d401d0f4ab188882bf6f8d9cf0f75eaf Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 7 Dec 2021 00:20:59 +0000 Subject: PM: core: Redefine pm_ptr() macro The pm_ptr() macro was previously conditionally defined, according to the value of the CONFIG_PM option. This meant that the pointed structure was either referenced (if CONFIG_PM was set), or never referenced (if CONFIG_PM was not set), causing it to be detected as unused by the compiler. This worked fine, but required the __maybe_unused compiler attribute to be used to every symbol pointed to by a pointer wrapped with pm_ptr(). We can do better. With this change, the pm_ptr() is now defined the same, independently of the value of CONFIG_PM. It now uses the (?:) ternary operator to conditionally resolve to its argument. Since the condition is known at compile time, the compiler will then choose to discard the unused symbols, which won't need to be tagged with __maybe_unused anymore. This pm_ptr() macro is usually used with pointers to dev_pm_ops structures created with SIMPLE_DEV_PM_OPS() or similar macros. These do use a __maybe_unused flag, which is now useless with this change, so it later can be removed. However in the meantime it causes no harm, and all the drivers still compile fine with the new pm_ptr() macro. Signed-off-by: Paul Cercueil Reviewed-by: Jonathan Cameron Reviewed-by: Arnd Bergmann Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 1d8209c09686..b88ac7dcf2a2 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -373,11 +373,7 @@ const struct dev_pm_ops __maybe_unused name = { \ SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } -#ifdef CONFIG_PM -#define pm_ptr(_ptr) (_ptr) -#else -#define pm_ptr(_ptr) NULL -#endif +#define pm_ptr(_ptr) PTR_IF(IS_ENABLED(CONFIG_PM), (_ptr)) /* * PM_EVENT_ messages -- cgit v1.2.3 From 1a3c7bb088266fa2db017be299f91f1c1894c857 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 7 Dec 2021 00:21:00 +0000 Subject: PM: core: Add new *_PM_OPS macros, deprecate old ones This commit introduces the following macros: SYSTEM_SLEEP_PM_OPS() LATE_SYSTEM_SLEEP_PM_OPS() NOIRQ_SYSTEM_SLEEP_PM_OPS() RUNTIME_PM_OPS() These new macros are very similar to their SET_*_PM_OPS() equivalent. They however differ in the fact that the callbacks they set will always be seen as referenced by the compiler. This means that the callback functions don't need to be wrapped with a #ifdef CONFIG_PM guard, or tagged with __maybe_unused, to prevent the compiler from complaining about unused static symbols. The compiler will then simply evaluate at compile time whether or not these symbols are dead code. The callbacks that are only useful when CONFIG_PM_SLEEP is enabled are now also wrapped with a new pm_sleep_ptr() macro, which is inspired by pm_ptr(). This is needed for drivers that use different callbacks for sleep and runtime PM, to handle the case where CONFIG_PM is set and CONFIG_PM_SLEEP is not.
This commit also deprecates the following macros: SIMPLE_DEV_PM_OPS() UNIVERSAL_DEV_PM_OPS() And introduces the following macros: DEFINE_SIMPLE_DEV_PM_OPS() DEFINE_UNIVERSAL_DEV_PM_OPS() These macros are similar to the functions they were created to replace, with the following differences: - They use the new macros introduced above, and as such always reference the provided callback functions. - They are not tagged with __maybe_unused. They are meant to be used with pm_ptr() or pm_sleep_ptr() for DEFINE_UNIVERSAL_DEV_PM_OPS() and DEFINE_SIMPLE_DEV_PM_OPS() respectively. - They declare the symbol static, since every driver seems to do that anyway; and if a non-static use-case is needed an indirection pointer could be used. The point of this change, is to progressively switch from a code model where PM callbacks are all protected behind CONFIG_PM guards, to a code model where the PM callbacks are always seen by the compiler, but discarded if not used. Signed-off-by: Paul Cercueil Reviewed-by: Jonathan Cameron Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 74 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index b88ac7dcf2a2..fc9691cb01b4 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -300,47 +300,59 @@ struct dev_pm_ops { int (*runtime_idle)(struct device *dev); }; +#define SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + .suspend = pm_sleep_ptr(suspend_fn), \ + .resume = pm_sleep_ptr(resume_fn), \ + .freeze = pm_sleep_ptr(suspend_fn), \ + .thaw = pm_sleep_ptr(resume_fn), \ + .poweroff = pm_sleep_ptr(suspend_fn), \ + .restore = pm_sleep_ptr(resume_fn), + +#define LATE_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + .suspend_late = pm_sleep_ptr(suspend_fn), \ + .resume_early = pm_sleep_ptr(resume_fn), \ + .freeze_late = pm_sleep_ptr(suspend_fn), \ + .thaw_early = pm_sleep_ptr(resume_fn), \ + .poweroff_late = pm_sleep_ptr(suspend_fn), \ + .restore_early = pm_sleep_ptr(resume_fn), + +#define NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + .suspend_noirq = pm_sleep_ptr(suspend_fn), \ + .resume_noirq = pm_sleep_ptr(resume_fn), \ + .freeze_noirq = pm_sleep_ptr(suspend_fn), \ + .thaw_noirq = pm_sleep_ptr(resume_fn), \ + .poweroff_noirq = pm_sleep_ptr(suspend_fn), \ + .restore_noirq = pm_sleep_ptr(resume_fn), + +#define RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ + .runtime_suspend = suspend_fn, \ + .runtime_resume = resume_fn, \ + .runtime_idle = idle_fn, + #ifdef CONFIG_PM_SLEEP #define SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ - .suspend = suspend_fn, \ - .resume = resume_fn, \ - .freeze = suspend_fn, \ - .thaw = resume_fn, \ - .poweroff = suspend_fn, \ - .restore = resume_fn, + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) #else #define SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) #endif #ifdef CONFIG_PM_SLEEP #define SET_LATE_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ - .suspend_late = suspend_fn, \ - .resume_early = 
resume_fn, \ - .freeze_late = suspend_fn, \ - .thaw_early = resume_fn, \ - .poweroff_late = suspend_fn, \ - .restore_early = resume_fn, + LATE_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) #else #define SET_LATE_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) #endif #ifdef CONFIG_PM_SLEEP #define SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ - .suspend_noirq = suspend_fn, \ - .resume_noirq = resume_fn, \ - .freeze_noirq = suspend_fn, \ - .thaw_noirq = resume_fn, \ - .poweroff_noirq = suspend_fn, \ - .restore_noirq = resume_fn, + NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) #else #define SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) #endif #ifdef CONFIG_PM #define SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ - .runtime_suspend = suspend_fn, \ - .runtime_resume = resume_fn, \ - .runtime_idle = idle_fn, + RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) #else #define SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) #endif @@ -349,9 +361,9 @@ struct dev_pm_ops { * Use this if you want to use the same suspend and resume callbacks for suspend * to RAM and hibernation. */ -#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ -const struct dev_pm_ops __maybe_unused name = { \ - SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ +#define DEFINE_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ +static const struct dev_pm_ops name = { \ + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ } /* @@ -367,6 +379,19 @@ const struct dev_pm_ops __maybe_unused name = { \ * .resume_early(), to the same routines as .runtime_suspend() and * .runtime_resume(), respectively (and analogously for hibernation). */ +#define DEFINE_UNIVERSAL_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ +static const struct dev_pm_ops name = { \ + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ +} + +/* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. 
*/ +#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ +const struct dev_pm_ops __maybe_unused name = { \ + SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ +} + +/* Deprecated. Use DEFINE_UNIVERSAL_DEV_PM_OPS() instead. */ #define UNIVERSAL_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ const struct dev_pm_ops __maybe_unused name = { \ SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ @@ -374,6 +399,7 @@ const struct dev_pm_ops __maybe_unused name = { \ } #define pm_ptr(_ptr) PTR_IF(IS_ENABLED(CONFIG_PM), (_ptr)) +#define pm_sleep_ptr(_ptr) PTR_IF(IS_ENABLED(CONFIG_PM_SLEEP), (_ptr)) /* * PM_EVENT_ messages -- cgit v1.2.3 From 931da6a0de5d620425af4425344259e6ff46b654 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 7 Dec 2021 21:17:34 +0800 Subject: powercap: intel_rapl: support new layout of Psys PowerLimit Register on SPR On Sapphire Rapids, the layout of the Psys domain Power Limit Register is different from what it was before. Enhance the code to support the new Psys PL register layout. Signed-off-by: Zhang Rui Reported-and-tested-by: Alkattan Dana [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/intel_rapl.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 93780834fc8f..9f4b6f5b822f 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -58,6 +58,12 @@ enum rapl_primitives { THROTTLED_TIME, PRIORITY_LEVEL, + PSYS_POWER_LIMIT1, + PSYS_POWER_LIMIT2, + PSYS_PL1_ENABLE, + PSYS_PL2_ENABLE, + PSYS_TIME_WINDOW1, + PSYS_TIME_WINDOW2, /* below are not raw primitive data */ AVERAGE_POWER, NR_RAPL_PRIMITIVES, -- cgit v1.2.3 From c24efa6732788f0be22cdf5d2aedd5e3117e983f Mon Sep 17 00:00:00 2001 From: "Rafael J.
Wysocki" Date: Tue, 7 Dec 2021 19:54:32 +0100 Subject: PM: runtime: Capture device status before disabling runtime PM In some cases (for example, during system-wide suspend and resume of devices) it is useful to know whether or not runtime PM has ever been enabled for a given device and, if so, what the runtime PM status of it had been right before runtime PM was disabled for it last time. For this reason, introduce a new struct dev_pm_info field called last_status that will be used for capturing the runtime PM status of the device when its power.disable_depth counter changes from 0 to 1. The new field will be set to RPM_INVALID to start with and whenever power.disable_depth changes from 1 to 0, so it will be valid only when runtime PM of the device is currently disabled, but it has been enabled at least once. Immediately use power.last_status in rpm_resume() to make it handle the case when PM runtime is disabled for the device, but its runtime PM status is RPM_ACTIVE more consistently. Namely, make it return 1 if power.last_status is also equal to RPM_ACTIVE in that case (the idea being that if the status was RPM_ACTIVE last time when power.disable_depth was changing from 0 to 1 and it is still RPM_ACTIVE, it can be assumed to reflect what happened to the device last time when it was using runtime PM) and -EACCES otherwise. Update the documentation to provide a description of last_status and change the description of pm_runtime_resume() in it to reflect the new behavior of rpm_active(). While at it, rearrange the code in pm_runtime_enable() to be more straightforward and replace the WARN() macro in it with a pr_warn() invocation which is less disruptive. Link: https://lore.kernel.org/linux-pm/20211026222626.39222-1-ulf.hansson@linaro.org/t/#u Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index fc9691cb01b4..e1e9402180b9 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -521,6 +521,7 @@ const struct dev_pm_ops __maybe_unused name = { \ */ enum rpm_status { + RPM_INVALID = -1, RPM_ACTIVE = 0, RPM_RESUMING, RPM_SUSPENDED, @@ -634,6 +635,7 @@ struct dev_pm_info { unsigned int links_count; enum rpm_request request; enum rpm_status runtime_status; + enum rpm_status last_status; int runtime_error; int autosuspend_delay; u64 last_busy; -- cgit v1.2.3 From d1579e61192e0e686faa4208500ef4c3b529b16c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Dec 2021 17:10:13 +0100 Subject: PM: runtime: Add safety net to supplier device release Because refcount_dec_not_one() returns true if the target refcount becomes saturated, it is generally unsafe to use its return value as a loop termination condition, but that is what happens when a device link's supplier device is released during runtime PM suspend operations and on device link removal. To address this, introduce pm_runtime_release_supplier() to be used in the above cases which will check the supplier device's runtime PM usage counter in addition to the refcount_dec_not_one() return value, so the loop can be terminated in case the rpm_active refcount value becomes invalid, and update the code in question to use it as appropriate. This change is not expected to have any visible functional impact. Reported-by: Peter Zijlstra Signed-off-by: Rafael J. 
Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Peter Zijlstra (Intel) --- include/linux/pm_runtime.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index eddd66d426ca..016de5776b6d 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -58,6 +58,7 @@ extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device_link *link); +extern void pm_runtime_release_supplier(struct device_link *link, bool check_idle); extern int devm_pm_runtime_enable(struct device *dev); @@ -283,6 +284,8 @@ static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} static inline void pm_runtime_drop_link(struct device_link *link) {} +static inline void pm_runtime_release_supplier(struct device_link *link, + bool check_idle) {} #endif /* !CONFIG_PM */ -- cgit v1.2.3 From 227fee5fc99eeb74d43bf68832f6d59d30ac07d8 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 16 Dec 2021 13:42:25 +0530 Subject: bus: mhi: core: Add an API for auto queueing buffers for DL channel Add a new API "mhi_prepare_for_transfer_autoqueue" for using with client drivers like QRTR to request MHI core to autoqueue buffers for the DL channel along with starting both UL and DL channels. So far, the "auto_queue" flag specified by the controller drivers in channel definition served this purpose but this will be removed at some point in future. Cc: netdev@vger.kernel.org Cc: Jakub Kicinski Cc: David S. 
Miller Cc: Greg Kroah-Hartman Co-developed-by: Loic Poulain Acked-by: Jakub Kicinski Signed-off-by: Loic Poulain Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20211216081227.237749-9-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index a5cc4cdf9cc8..a5441ad33c74 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -730,15 +730,26 @@ void mhi_device_put(struct mhi_device *mhi_dev); /** * mhi_prepare_for_transfer - Setup UL and DL channels for data transfer. - * Allocate and initialize the channel context and - * also issue the START channel command to both - * channels. Channels can be started only if both - * host and device execution environments match and - * channels are in a DISABLED state. * @mhi_dev: Device associated with the channels + * + * Allocate and initialize the channel context and also issue the START channel + * command to both channels. Channels can be started only if both host and + * device execution environments match and channels are in a DISABLED state. */ int mhi_prepare_for_transfer(struct mhi_device *mhi_dev); +/** + * mhi_prepare_for_transfer_autoqueue - Setup UL and DL channels with auto queue + * buffers for DL traffic + * @mhi_dev: Device associated with the channels + * + * Allocate and initialize the channel context and also issue the START channel + * command to both channels. Channels can be started only if both host and + * device execution environments match and channels are in a DISABLED state. + * The MHI core will automatically allocate and queue buffers for the DL traffic. + */ +int mhi_prepare_for_transfer_autoqueue(struct mhi_device *mhi_dev); + /** * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer. 
* Issue the RESET channel command and let the -- cgit v1.2.3 From 49f39cb0ef198ae3c73765c9b9ee3034e4c9f076 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Dec 2021 14:59:30 +0200 Subject: device property: Fix documentation for FWNODE_GRAPH_DEVICE_DISABLED FWNODE_GRAPH_DEVICE_DISABLED flag was meant for also returning endpoints connected to disabled devices, but it also may return endpoints that are not connected. Fix this in documentation. Also fwnode_graph_get_endpoint_by_id() was affected by this. Also improve the language a little bit. Fixes: 0fcc2bdc8aff ("device property: Add fwnode_graph_get_endpoint_by_id()") Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- include/linux/property.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 16f736c698a2..7a2df45ec3ae 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -414,7 +414,8 @@ static inline bool fwnode_graph_is_endpoint(struct fwnode_handle *fwnode) * one. * @FWNODE_GRAPH_DEVICE_DISABLED: That the device to which the remote * endpoint of the given endpoint belongs to, - * may be disabled. + * may be disabled, or that the endpoint is not + * connected. */ #define FWNODE_GRAPH_ENDPOINT_NEXT BIT(0) #define FWNODE_GRAPH_DEVICE_DISABLED BIT(1) -- cgit v1.2.3 From c87b8fc569667610b4891cad1e4a663e5a94d8f8 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Dec 2021 14:59:33 +0200 Subject: device property: Implement fwnode_graph_get_endpoint_count() Add fwnode_graph_get_endpoint_count() function to provide generic implementation of of_graph_get_endpoint_count(). The former by default only counts endpoints to available devices which is consistent with the rest of the fwnode graph API. By providing FWNODE_GRAPH_DEVICE_DISABLED flag, also unconnected endpoints and endpoints to disabled devices are counted. 
Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- include/linux/property.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 7a2df45ec3ae..8c0104871252 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -423,6 +423,8 @@ static inline bool fwnode_graph_is_endpoint(struct fwnode_handle *fwnode) struct fwnode_handle * fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, u32 port, u32 endpoint, unsigned long flags); +unsigned int fwnode_graph_get_endpoint_count(struct fwnode_handle *fwnode, + unsigned long flags); #define fwnode_graph_for_each_endpoint(fwnode, child) \ for (child = NULL; \ -- cgit v1.2.3 From c49eea6ffec626c059ace085fce1bf501b05dbc7 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Dec 2021 15:01:15 +0200 Subject: device property: Drop fwnode_graph_get_remote_node() fwnode_graph_get_remote_node() is only used by the tegra-video driver. Convert it to use newer fwnode_graph_get_endpoint_by_id() and drop now-unused fwnode_graph_get_remote_node(). Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- include/linux/property.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 8c0104871252..8355f99ebd47 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -397,9 +397,6 @@ struct fwnode_handle *fwnode_graph_get_remote_port( const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_graph_get_remote_endpoint( const struct fwnode_handle *fwnode); -struct fwnode_handle * -fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port, - u32 endpoint); static inline bool fwnode_graph_is_endpoint(struct fwnode_handle *fwnode) { -- cgit v1.2.3 From bd0b536dc2e1e9828a85b1e3470ee7bafc3b36f6 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Mon, 29 Nov 2021 16:15:59 -0800 Subject: virtchnl: Add support for new VLAN capabilities Currently VIRTCHNL only allows for VLAN filtering and offloads to happen on a single 802.1Q VLAN. Add support to filter and offload on inner, outer, and/or inner + outer VLANs. This is done by introducing the new capability VIRTCHNL_VF_OFFLOAD_VLAN_V2. The flow to negotiate this new capability is shown below. 1. VF - sets the VIRTCHNL_VF_OFFLOAD_VLAN_V2 bit in the virtchnl_vf_resource.vf_caps_flags during the VIRTCHNL_OP_GET_VF_RESOURCES request message. The VF should also set the VIRTCHNL_VF_OFFLOAD_VLAN bit in case the PF driver doesn't support the new capability. 2. PF - sets the VLAN capability bit it supports in the VIRTCHNL_OP_GET_VF_RESOURCES response message. This will either be VIRTCHNL_VF_OFFLOAD_VLAN_V2, VIRTCHNL_VF_OFFLOAD_VLAN, or none. 3. VF - If the VIRTCHNL_VF_OFFLOAD_VLAN_V2 capability was ACK'd by the PF, then the VF needs to request the VLAN capabilities of the PF/Device by issuing a VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS request. If the VIRTCHNL_VF_OFFLOAD_VLAN capability was ACK'd then the VF knows only single 802.1Q VLAN filtering/offloads are supported. 
If no VLAN capability is ACK'd then the PF/Device doesn't support hardware VLAN filtering/offloads for this VF. 4. PF - Populates the virtchnl_vlan_caps structure based on what it allows/supports for that VF and sends that response via VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS. After VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS is successfully negotiated the VF driver needs to interpret the capabilities supported by the underlying PF/Device. The VF will be allowed to filter/offload the inner 802.1Q, outer (various ethertype), inner 802.1Q + outer (various ethertypes), or none based on which fields are set. The VF will also need to interpret where the VLAN tag should be inserted and/or stripped based on the negotiated capabilities. Signed-off-by: Brett Creeley Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 377 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 377 insertions(+) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index b30a1bc74fc7..2ce27e8e4f19 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -141,6 +141,13 @@ enum virtchnl_ops { VIRTCHNL_OP_DEL_RSS_CFG = 46, VIRTCHNL_OP_ADD_FDIR_FILTER = 47, VIRTCHNL_OP_DEL_FDIR_FILTER = 48, + VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS = 51, + VIRTCHNL_OP_ADD_VLAN_V2 = 52, + VIRTCHNL_OP_DEL_VLAN_V2 = 53, + VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 = 54, + VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 = 55, + VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 = 56, + VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 = 57, VIRTCHNL_OP_MAX, }; @@ -246,6 +253,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES BIT(6) /* used to negotiate communicating link speeds in Mbps */ #define VIRTCHNL_VF_CAP_ADV_LINK_SPEED BIT(7) +#define VIRTCHNL_VF_OFFLOAD_VLAN_V2 BIT(15) #define VIRTCHNL_VF_OFFLOAD_VLAN BIT(16) #define VIRTCHNL_VF_OFFLOAD_RX_POLLING BIT(17) #define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 BIT(18) 
@@ -475,6 +483,351 @@ struct virtchnl_vlan_filter_list { VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_vlan_filter_list); +/* This enum is used for all of the VIRTCHNL_VF_OFFLOAD_VLAN_V2_CAPS related + * structures and opcodes. + * + * VIRTCHNL_VLAN_UNSUPPORTED - This field is not supported and if a VF driver + * populates it the PF should return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED. + * + * VIRTCHNL_VLAN_ETHERTYPE_8100 - This field supports 0x8100 ethertype. + * VIRTCHNL_VLAN_ETHERTYPE_88A8 - This field supports 0x88A8 ethertype. + * VIRTCHNL_VLAN_ETHERTYPE_9100 - This field supports 0x9100 ethertype. + * + * VIRTCHNL_VLAN_ETHERTYPE_AND - Used when multiple ethertypes can be supported + * by the PF concurrently. For example, if the PF can support + * VIRTCHNL_VLAN_ETHERTYPE_8100 AND VIRTCHNL_VLAN_ETHERTYPE_88A8 filters it + * would OR the following bits: + * + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * The VF would interpret this as VLAN filtering can be supported on both 0x8100 + * and 0x88A8 VLAN ethertypes. + * + * VIRTCHNL_VLAN_ETHERTYPE_XOR - Used when only a single ethertype can be supported + * by the PF concurrently. For example if the PF can support + * VIRTCHNL_VLAN_ETHERTYPE_8100 XOR VIRTCHNL_VLAN_ETHERTYPE_88A8 stripping + * offload it would OR the following bits: + * + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * The VF would interpret this as VLAN stripping can be supported on either + * 0x8100 or 0x88a8 VLAN ethertypes. So when requesting VLAN stripping via + * VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 the specified ethertype will override + * the previously set value. + * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1 - Used to tell the VF to insert and/or + * strip the VLAN tag using the L2TAG1 field of the Tx/Rx descriptors. 
+ * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 - Used to tell the VF to insert hardware + * offloaded VLAN tags using the L2TAG2 field of the Tx descriptor. + * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 - Used to tell the VF to strip hardware + * offloaded VLAN tags using the L2TAG2_2 field of the Rx descriptor. + * + * VIRTCHNL_VLAN_PRIO - This field supports VLAN priority bits. This is used for + * VLAN filtering if the underlying PF supports it. + * + * VIRTCHNL_VLAN_TOGGLE - This field is used to say whether a + * certain VLAN capability can be toggled. For example if the underlying PF/CP + * allows the VF to toggle VLAN filtering, stripping, and/or insertion it should + * set this bit along with the supported ethertypes. + */ +enum virtchnl_vlan_support { + VIRTCHNL_VLAN_UNSUPPORTED = 0, + VIRTCHNL_VLAN_ETHERTYPE_8100 = BIT(0), + VIRTCHNL_VLAN_ETHERTYPE_88A8 = BIT(1), + VIRTCHNL_VLAN_ETHERTYPE_9100 = BIT(2), + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1 = BIT(8), + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 = BIT(9), + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 = BIT(10), + VIRTCHNL_VLAN_PRIO = BIT(24), + VIRTCHNL_VLAN_FILTER_MASK = BIT(28), + VIRTCHNL_VLAN_ETHERTYPE_AND = BIT(29), + VIRTCHNL_VLAN_ETHERTYPE_XOR = BIT(30), + VIRTCHNL_VLAN_TOGGLE = BIT(31), +}; + +/* This structure is used as part of the VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS + * for filtering, insertion, and stripping capabilities. + * + * If only outer capabilities are supported (for filtering, insertion, and/or + * stripping) then this refers to the outer most or single VLAN from the VF's + * perspective. + * + * If only inner capabilities are supported (for filtering, insertion, and/or + * stripping) then this refers to the outer most or single VLAN from the VF's + * perspective. Functionally this is the same as if only outer capabilities are + * supported. The VF driver is just forced to use the inner fields when + * adding/deleting filters and enabling/disabling offloads (if supported). 
+ * + * If both outer and inner capabilities are supported (for filtering, insertion, + * and/or stripping) then outer refers to the outer most or single VLAN and + * inner refers to the second VLAN, if it exists, in the packet. + * + * There is no support for tunneled VLAN offloads, so outer or inner are never + * referring to a tunneled packet from the VF's perspective. + */ +struct virtchnl_vlan_supported_caps { + u32 outer; + u32 inner; +}; + +/* The PF populates these fields based on the supported VLAN filtering. If a + * field is VIRTCHNL_VLAN_UNSUPPORTED then it's not supported and the PF will + * reject any VIRTCHNL_OP_ADD_VLAN_V2 or VIRTCHNL_OP_DEL_VLAN_V2 messages using + * the unsupported fields. + * + * Also, a VF is only allowed to toggle its VLAN filtering setting if the + * VIRTCHNL_VLAN_TOGGLE bit is set. + * + * The ethertype(s) specified in the ethertype_init field are the ethertypes + * enabled for VLAN filtering. VLAN filtering in this case refers to the outer + * most VLAN from the VF's perspective. If both inner and outer filtering are + * allowed then ethertype_init only refers to the outer most VLAN as only + * VLAN ethertype supported for inner VLAN filtering is + * VIRTCHNL_VLAN_ETHERTYPE_8100. By default, inner VLAN filtering is disabled + * when both inner and outer filtering are allowed. + * + * The max_filters field tells the VF how many VLAN filters it's allowed to have + * at any one time. If it exceeds this amount and tries to add another filter, + * then the request will be rejected by the PF. To prevent failures, the VF + * should keep track of how many VLAN filters it has added and not attempt to + * add more than max_filters. 
+ */ +struct virtchnl_vlan_filtering_caps { + struct virtchnl_vlan_supported_caps filtering_support; + u32 ethertype_init; + u16 max_filters; + u8 pad[2]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vlan_filtering_caps); + +/* This enum is used for the virtchnl_vlan_offload_caps structure to specify + * if the PF supports a different ethertype for stripping and insertion. + * + * VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION - The ethertype(s) specified + * for stripping affect the ethertype(s) specified for insertion and vice versa + * as well. If the VF tries to configure VLAN stripping via + * VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 with VIRTCHNL_VLAN_ETHERTYPE_8100 then + * that will be the ethertype for both stripping and insertion. + * + * VIRTCHNL_ETHERTYPE_MATCH_NOT_REQUIRED - The ethertype(s) specified for + * stripping do not affect the ethertype(s) specified for insertion and vice + * versa. + */ +enum virtchnl_vlan_ethertype_match { + VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION = 0, + VIRTCHNL_ETHERTYPE_MATCH_NOT_REQUIRED = 1, +}; + +/* The PF populates these fields based on the supported VLAN offloads. If a + * field is VIRTCHNL_VLAN_UNSUPPORTED then it's not supported and the PF will + * reject any VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 or + * VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 messages using the unsupported fields. + * + * Also, a VF is only allowed to toggle its VLAN offload setting if the + * VIRTCHNL_VLAN_TOGGLE bit is set. + * + * The VF driver needs to be aware of how the tags are stripped by hardware and + * inserted by the VF driver based on the level of offload support. The PF will + * populate these fields based on where the VLAN tags are expected to be + * offloaded via the VIRTCHNL_VLAN_TAG_LOCATION_* bits. The VF will need to + * interpret these fields. See the definition of the + * VIRTCHNL_VLAN_TAG_LOCATION_* bits above the virtchnl_vlan_support + * enumeration. 
+ */ +struct virtchnl_vlan_offload_caps { + struct virtchnl_vlan_supported_caps stripping_support; + struct virtchnl_vlan_supported_caps insertion_support; + u32 ethertype_init; + u8 ethertype_match; + u8 pad[3]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_vlan_offload_caps); + +/* VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS + * VF sends this message to determine its VLAN capabilities. + * + * PF will mark which capabilities it supports based on hardware support and + * current configuration. For example, if a port VLAN is configured the PF will + * not allow outer VLAN filtering, stripping, or insertion to be configured so + * it will block these features from the VF. + * + * The VF will need to cross reference its capabilities with the PFs + * capabilities in the response message from the PF to determine the VLAN + * support. + */ +struct virtchnl_vlan_caps { + struct virtchnl_vlan_filtering_caps filtering; + struct virtchnl_vlan_offload_caps offloads; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_vlan_caps); + +struct virtchnl_vlan { + u16 tci; /* tci[15:13] = PCP and tci[11:0] = VID */ + u16 tci_mask; /* only valid if VIRTCHNL_VLAN_FILTER_MASK set in + * filtering caps + */ + u16 tpid; /* 0x8100, 0x88a8, etc. and only type(s) set in + * filtering caps. Note that tpid here does not refer to + * VIRTCHNL_VLAN_ETHERTYPE_*, but it refers to the + * actual 2-byte VLAN TPID + */ + u8 pad[2]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_vlan); + +struct virtchnl_vlan_filter { + struct virtchnl_vlan inner; + struct virtchnl_vlan outer; + u8 pad[16]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(32, virtchnl_vlan_filter); + +/* VIRTCHNL_OP_ADD_VLAN_V2 + * VIRTCHNL_OP_DEL_VLAN_V2 + * + * VF sends these messages to add/del one or more VLAN tag filters for Rx + * traffic. + * + * The PF attempts to add the filters and returns status. + * + * The VF should only ever attempt to add/del virtchnl_vlan_filter(s) using the + * supported fields negotiated via VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS. 
+ */ +struct virtchnl_vlan_filter_list_v2 { + u16 vport_id; + u16 num_elements; + u8 pad[4]; + struct virtchnl_vlan_filter filters[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_vlan_filter_list_v2); + +/* VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 + * VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 + * VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 + * VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 + * + * VF sends this message to enable or disable VLAN stripping or insertion. It + * also needs to specify an ethertype. The VF knows which VLAN ethertypes are + * allowed and whether or not it's allowed to enable/disable the specific + * offload via the VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS message. The VF needs to + * parse the virtchnl_vlan_caps.offloads fields to determine which offload + * messages are allowed. + * + * For example, if the PF populates the virtchnl_vlan_caps.offloads in the + * following manner the VF will be allowed to enable and/or disable 0x8100 inner + * VLAN insertion and/or stripping via the opcodes listed above. Inner in this + * case means the outer most or single VLAN from the VF's perspective. This is + * because no outer offloads are supported. See the comments above the + * virtchnl_vlan_supported_caps structure for more details. + * + * virtchnl_vlan_caps.offloads.stripping_support.inner = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * virtchnl_vlan_caps.offloads.insertion_support.inner = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * In order to enable inner (again note that in this case inner is the outer + * most or single VLAN from the VF's perspective) VLAN stripping for 0x8100 + * VLANs, the VF would populate the virtchnl_vlan_setting structure in the + * following manner and send the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 message. + * + * virtchnl_vlan_setting.inner_ethertype_setting = + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. 
+ * + * The reason that VLAN TPID(s) are not being used for the + * outer_ethertype_setting and inner_ethertype_setting fields is because it's + * possible a device could support VLAN insertion and/or stripping offload on + * multiple ethertypes concurrently, so this method allows a VF to request + * multiple ethertypes in one message using the virtchnl_vlan_support + * enumeration. + * + * For example, if the PF populates the virtchnl_vlan_caps.offloads in the + * following manner the VF will be allowed to enable 0x8100 and 0x88a8 outer + * VLAN insertion and stripping simultaneously. The + * virtchnl_vlan_caps.offloads.ethertype_match field will also have to be + * populated based on what the PF can support. + * + * virtchnl_vlan_caps.offloads.stripping_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * virtchnl_vlan_caps.offloads.insertion_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * In order to enable outer VLAN stripping for 0x8100 and 0x88a8 VLANs, the VF + * would populate the virtchnl_vlan_setting structure in the following manner + * and send the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 message. + * + * virtchnl_vlan_setting.outer_ethertype_setting = + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. + * + * There is also the case where a PF and the underlying hardware can support + * VLAN offloads on multiple ethertypes, but not concurrently. For example, if + * the PF populates the virtchnl_vlan_caps.offloads in the following manner the + * VF will be allowed to enable and/or disable 0x8100 XOR 0x88a8 outer VLAN + * offloads. The ethertypes must match for stripping and insertion. 
+ * + * virtchnl_vlan_caps.offloads.stripping_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * virtchnl_vlan_caps.offloads.insertion_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * virtchnl_vlan_caps.offloads.ethertype_match = + * VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION; + * + * In order to enable outer VLAN stripping for 0x88a8 VLANs, the VF would + * populate the virtchnl_vlan_setting structure in the following manner and send + * the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2. Also, this will change the + * ethertype for VLAN insertion if it's enabled. So, for completeness, a + * VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 with the same ethertype should be sent. + * + * virtchnl_vlan_setting.outer_ethertype_setting = VIRTCHNL_VLAN_ETHERTYPE_88A8; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. + */ +struct virtchnl_vlan_setting { + u32 outer_ethertype_setting; + u32 inner_ethertype_setting; + u16 vport_id; + u8 pad[6]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vlan_setting); + /* VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE * VF sends VSI id and flags. * PF returns status code in retval. 
@@ -1156,6 +1509,30 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_DEL_FDIR_FILTER: valid_len = sizeof(struct virtchnl_fdir_del); break; + case VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS: + break; + case VIRTCHNL_OP_ADD_VLAN_V2: + case VIRTCHNL_OP_DEL_VLAN_V2: + valid_len = sizeof(struct virtchnl_vlan_filter_list_v2); + if (msglen >= valid_len) { + struct virtchnl_vlan_filter_list_v2 *vfl = + (struct virtchnl_vlan_filter_list_v2 *)msg; + + valid_len += (vfl->num_elements - 1) * + sizeof(struct virtchnl_vlan_filter); + + if (vfl->num_elements == 0) { + err_msg_format = true; + break; + } + } + break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2: + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2: + case VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2: + case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2: + valid_len = sizeof(struct virtchnl_vlan_setting); + break; /* These are always errors coming from the VF. */ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: -- cgit v1.2.3 From 877fee2a0c65a3b0b6ac0e90d7d7718b5a0341d3 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 17 Dec 2021 15:15:15 +0100 Subject: PCI: Convert pci_dev_present() stub to static inline Change the pci_dev_present() stub which is used when CONFIG_PCI is not set from a #define to a static inline stub. This should fix clang -Werror builds failing due to errors like this: drivers/platform/x86/thinkpad_acpi.c:4475:35: error: unused variable 'fwbug_cards_ids' [-Werror,-Wunused-const-variable] Where fwbug_cards_ids is an array of pci_device_id passed to pci_dev_present() during a quirk check.
Link: https://lore.kernel.org/r/20211217141515.379586-1-hdegoede@redhat.com Reported-by: kernel test robot Signed-off-by: Hans de Goede Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Cc: platform-driver-x86@vger.kernel.org --- include/linux/pci.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 18a75c8e615c..7d825637d7ca 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1775,7 +1775,10 @@ static inline struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from) { return NULL; } -#define pci_dev_present(ids) (0) + +static inline int pci_dev_present(const struct pci_device_id *ids) +{ return 0; } + #define no_pci_devices() (1) #define pci_dev_put(dev) do { } while (0) -- cgit v1.2.3 From 3849595866166b23bf6a0cb9ff87e06423167f67 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 14 Dec 2021 19:24:34 +0200 Subject: net/sched: flow_dissector: Fix matching on zone id for invalid conns If ct rejects a flow, it removes the conntrack info from the skb. act_ct sets the post_ct variable so the dissector will see this case as an +tracked +invalid state, but the zone id is lost with the conntrack info. To restore the zone id on such cases, set the last executed zone, via the tc control block, when passing ct, and read it back in the dissector if there is no ct info on the skb (invalid connection). 
Fixes: 7baf2429a1a9 ("net/sched: cls_flower add CT_FLAGS_INVALID flag support") Signed-off-by: Paul Blakey Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c8cb7e697d47..2ecf8cfd2223 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1380,7 +1380,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, - bool post_ct); + bool post_ct, u16 zone); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, -- cgit v1.2.3 From 635d448a1cce4b4ebee52b351052c70434fa90ea Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 14 Dec 2021 19:24:35 +0200 Subject: net: openvswitch: Fix matching zone id for invalid conns arriving from tc Zone id is not restored if we passed ct and ct rejected the connection, as there is no ct info on the skb. Save the zone from tc skb cb to tc skb extension and pass it on to ovs, use that info to restore the zone id for invalid connections. 
Fixes: d29334c15d33 ("net/sched: act_api: fix miss set post_ct for ovs after do conntrack in act_ct") Signed-off-by: Paul Blakey Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2ecf8cfd2223..4507d77d6941 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -286,6 +286,7 @@ struct nf_bridge_info { struct tc_skb_ext { __u32 chain; __u16 mru; + __u16 zone; bool post_ct; }; #endif -- cgit v1.2.3 From 6e478521df535b9d5ef5eb84d4352f235bbbef99 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 30 Jul 2021 09:56:05 -0400 Subject: iomap,xfs: Convert ->discard_page to ->discard_folio XFS has the only implementation of ->discard_page today, so convert it to use folios in the same patch as converting the API. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 29491fb9c5ba..5ef5088dbbd8 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -285,7 +285,7 @@ struct iomap_writeback_ops { * Optional, allows the file system to discard state on a page where * we failed to submit any I/O. */ - void (*discard_page)(struct page *page, loff_t fileoff); + void (*discard_folio)(struct folio *folio, loff_t pos); }; struct iomap_writepage_ctx { -- cgit v1.2.3 From e17f7a0bc4daa44a4809f5f2f947aa2aa74d1369 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Dec 2021 09:45:05 +0100 Subject: uio: remove copy_from_iter_flushcache() and copy_mc_to_iter() These two wrappers are never used. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211215084508.435401-2-hch@lst.de Signed-off-by: Dan Williams --- include/linux/uio.h | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 6350354f97e9..494d552c1d66 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -196,7 +196,7 @@ bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /* * Note, users like pmem that depend on the stricter semantics of - * copy_from_iter_flushcache() than copy_from_iter_nocache() must check for + * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the * destination is flushed from the cache on return. */ @@ -211,24 +211,6 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #define _copy_mc_to_iter _copy_to_iter #endif -static __always_inline __must_check -size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) -{ - if (unlikely(!check_copy_size(addr, bytes, false))) - return 0; - else - return _copy_from_iter_flushcache(addr, bytes, i); -} - -static __always_inline __must_check -size_t copy_mc_to_iter(void *addr, size_t bytes, struct iov_iter *i) -{ - if (unlikely(!check_copy_size(addr, bytes, true))) - return 0; - else - return _copy_mc_to_iter(addr, bytes, i); -} - size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); -- cgit v1.2.3 From fd1d00ec92002d8fe28ca981a72395eaa7ae3d11 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Dec 2021 09:45:06 +0100 Subject: dax: simplify dax_synchronous and set_dax_synchronous Remove the pointless wrappers. 
Signed-off-by: Christoph Hellwig Reviewed-by: Pankaj Gupta Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20211215084508.435401-3-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 87ae4c9b1d65..3bd1fdb5d5f4 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -48,16 +48,8 @@ void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); -bool __dax_synchronous(struct dax_device *dax_dev); -static inline bool dax_synchronous(struct dax_device *dax_dev) -{ - return __dax_synchronous(dax_dev); -} -void __set_dax_synchronous(struct dax_device *dax_dev); -static inline void set_dax_synchronous(struct dax_device *dax_dev) -{ - __set_dax_synchronous(dax_dev); -} +bool dax_synchronous(struct dax_device *dax_dev); +void set_dax_synchronous(struct dax_device *dax_dev); /* * Check if given mapping is supported by the file / underlying device. */ -- cgit v1.2.3 From 30c6828a17a572aeb9e3a3bacce05fdcf1106541 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Dec 2021 09:45:07 +0100 Subject: dax: remove the DAXDEV_F_SYNC flag Remove the DAXDEV_F_SYNC flag and thus the flags argument to alloc_dax and just let the drivers call set_dax_synchronous directly. 
Signed-off-by: Christoph Hellwig Reviewed-by: Pankaj Gupta Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20211215084508.435401-4-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 3bd1fdb5d5f4..c04f46478e3b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -6,9 +6,6 @@ #include #include -/* Flag for synchronous flush */ -#define DAXDEV_F_SYNC (1UL << 0) - typedef unsigned long dax_entry_t; struct dax_device; @@ -42,8 +39,7 @@ struct dax_operations { }; #if IS_ENABLED(CONFIG_DAX) -struct dax_device *alloc_dax(void *private, const struct dax_operations *ops, - unsigned long flags); +struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); @@ -64,7 +60,7 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, } #else static inline struct dax_device *alloc_dax(void *private, - const struct dax_operations *ops, unsigned long flags) + const struct dax_operations *ops) { /* * Callers should check IS_ENABLED(CONFIG_DAX) to know if this -- cgit v1.2.3 From 7ac5360cd4d02cc7e0eaf10867f599e041822f12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Dec 2021 09:45:08 +0100 Subject: dax: remove the copy_from_iter and copy_to_iter methods These methods indirect the actual DAX read/write path. In the end pmem uses magic flush and mc safe variants and fuse and dcssblk use plain ones while device mapper picks redirects to the underlying device. Add set_dax_nocache() and set_dax_nomc() APIs to control which copy routines are used to remove indirect call from the read/write fast path as well as a lot of boilerplate code. 
Signed-off-by: Christoph Hellwig Reviewed-by: Vivek Goyal [virtiofs] Link: https://lore.kernel.org/r/20211215084508.435401-5-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 9 +++------ include/linux/device-mapper.h | 4 ---- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index c04f46478e3b..9fc5f99a0ae2 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -28,12 +28,6 @@ struct dax_operations { */ bool (*dax_supported)(struct dax_device *, struct block_device *, int, sector_t, sector_t); - /* copy_from_iter: required operation for fs-dax direct-i/o */ - size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t, - struct iov_iter *); - /* copy_to_iter: required operation for fs-dax direct-i/o */ - size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t, - struct iov_iter *); /* zero_page_range: required operation. Zero page range */ int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); }; @@ -95,6 +89,9 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, } #endif +void set_dax_nocache(struct dax_device *dax_dev); +void set_dax_nomc(struct dax_device *dax_dev); + struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index a7df155ea49b..b26fecf6c8e8 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -147,8 +147,6 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); */ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); -typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff, - void *addr, size_t bytes, struct iov_iter *i); typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, size_t nr_pages); @@ 
-200,8 +198,6 @@ struct target_type { dm_iterate_devices_fn iterate_devices; dm_io_hints_fn io_hints; dm_dax_direct_access_fn direct_access; - dm_dax_copy_iter_fn dax_copy_from_iter; - dm_dax_copy_iter_fn dax_copy_to_iter; dm_dax_zero_page_range_fn dax_zero_page_range; /* For internal device-mapper use. */ -- cgit v1.2.3 From d639b9d13a39cf15639cbe6e8b2c43eb60148a73 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:44 -0800 Subject: bpf: Introduce composable reg, ret and arg types. There are some common properties shared between bpf reg, ret and arg values. For instance, a value may be a NULL pointer, or a pointer to a read-only memory. Previously, to express these properties, enumeration was used. For example, in order to test whether a reg value can be NULL, reg_type_may_be_null() simply enumerates all types that are possibly NULL. The problem with this approach is that it's not scalable and causes a lot of duplication. These properties can be combined, for example, a type could be either MAYBE_NULL or RDONLY, or both. This patch series rewrites the layout of reg_type, arg_type and ret_type, so that common properties can be extracted and represented as composable flags. For example, one can write ARG_PTR_TO_MEM | PTR_MAYBE_NULL which is equivalent to the previous ARG_PTR_TO_MEM_OR_NULL. The type ARG_PTR_TO_MEM is called a "base type" in this patch. Base types can be extended with flags. A flag occupies the higher bits while base types sit in the lower bits. This patch in particular sets up a set of macros for this purpose. The following patches will rewrite arg_types, ret_types and reg_types respectively.
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-2-haoluo@google.com --- include/linux/bpf.h | 42 ++++++++++++++++++++++++++++++++++++++++++ include/linux/bpf_verifier.h | 13 +++++++++++++ 2 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 965fffaf0308..41bb3687cc85 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -297,6 +297,29 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, extern const struct bpf_map_ops bpf_map_offload_ops; +/* bpf_type_flag contains a set of flags that are applicable to the values of + * arg_type, ret_type and reg_type. For example, a pointer value may be null, + * or a memory is read-only. We classify types into two categories: base types + * and extended types. Extended types are base types combined with a type flag. + * + * Currently there are no more than 32 base types in arg_type, ret_type and + * reg_types. + */ +#define BPF_BASE_TYPE_BITS 8 + +enum bpf_type_flag { + /* PTR may be NULL. */ + PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = PTR_MAYBE_NULL, +}; + +/* Max number of base types. */ +#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) + +/* Max number of all types. */ +#define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) + /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -343,7 +366,13 @@ enum bpf_arg_type { ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ __BPF_ARG_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. 
+ */ + __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* type of values returned from helper functions */ enum bpf_return_type { @@ -359,7 +388,14 @@ enum bpf_return_type { RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ + __BPF_RET_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL @@ -461,7 +497,13 @@ enum bpf_reg_type { PTR_TO_FUNC, /* reg points to a bpf program function */ PTR_TO_MAP_KEY, /* reg points to a map element key */ __BPF_REG_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* The information passed from prog-specific *_is_valid_access * back to the verifier. diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ee931398f311..34e4ceaca3c7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -546,5 +546,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); +#define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) + +/* extract base type from bpf_{arg, return, reg}_type. 
*/ +static inline u32 base_type(u32 type) +{ + return type & BPF_BASE_TYPE_MASK; +} + +/* extract flags from an extended type. See bpf_type_flag in bpf.h. */ +static inline u32 type_flag(u32 type) +{ + return type & ~BPF_BASE_TYPE_MASK; +} #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From 48946bd6a5d695c50b34546864b79c1f910a33c1 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:45 -0800 Subject: bpf: Replace ARG_XXX_OR_NULL with ARG_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_arg composable, by reserving high bits of bpf_arg to represent flags of a type. One of the flags is PTR_MAYBE_NULL which indicates a pointer may be NULL. When applying this flag to an arg_type, it means the arg can take NULL pointer. This patch switches the qualified arg_types to use this flag. The arg_types changed in this patch include: 1. ARG_PTR_TO_MAP_VALUE_OR_NULL 2. ARG_PTR_TO_MEM_OR_NULL 3. ARG_PTR_TO_CTX_OR_NULL 4. ARG_PTR_TO_SOCKET_OR_NULL 5. ARG_PTR_TO_ALLOC_MEM_OR_NULL 6. ARG_PTR_TO_STACK_OR_NULL This patch does not eliminate the use of these arg_types, instead it makes them an alias to the 'ARG_XXX | PTR_MAYBE_NULL'. 
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-3-haoluo@google.com --- include/linux/bpf.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 41bb3687cc85..765bd7cc4272 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -331,13 +331,11 @@ enum bpf_arg_type { ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ - ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ - ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, * helper function must fill all bytes or clear * them in error case. 
@@ -347,26 +345,31 @@ enum bpf_arg_type { ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ - ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ - ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ - ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ - ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ + ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ __BPF_ARG_TYPE_MAX, + /* Extended arg_types. */ + ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, + ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, + ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, + ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, + ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, + ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. 
*/ -- cgit v1.2.3 From 3c4807322660d4290ac9062c034aed6b87243861 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:46 -0800 Subject: bpf: Replace RET_XXX_OR_NULL with RET_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_ret composable, by reserving high bits to represent flags. One of the flags is PTR_MAYBE_NULL, which indicates a pointer may be NULL. When applying this flag to ret_types, it means the returned value could be a NULL pointer. This patch switches the qualified ret_types to use this flag. The ret_types changed in this patch include: 1. RET_PTR_TO_MAP_VALUE_OR_NULL 2. RET_PTR_TO_SOCKET_OR_NULL 3. RET_PTR_TO_TCP_SOCK_OR_NULL 4. RET_PTR_TO_SOCK_COMMON_OR_NULL 5. RET_PTR_TO_ALLOC_MEM_OR_NULL 6. RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL 7. RET_PTR_TO_BTF_ID_OR_NULL This patch doesn't eliminate the use of these names, instead it makes them aliases to 'RET_PTR_TO_XXX | PTR_MAYBE_NULL'. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-4-haoluo@google.com --- include/linux/bpf.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 765bd7cc4272..975a1d5951bd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -382,17 +382,22 @@ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ - RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ - RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ - RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ - RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ - RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ - RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a
btf_id or NULL */ - RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ + RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ + RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ + RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, + /* Extended ret_types. */ + RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, + RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, + RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, + RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ -- cgit v1.2.3 From c25b2ae136039ffa820c26138ed4a5e5f3ab3841 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:47 -0800 Subject: bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_reg composable, by allocating bits in the type to represent flags. One of the flags is PTR_MAYBE_NULL which indicates a pointer may be NULL. This patch switches the qualified reg_types to use this flag. The reg_types changed in this patch include: 1. PTR_TO_MAP_VALUE_OR_NULL 2. PTR_TO_SOCKET_OR_NULL 3. PTR_TO_SOCK_COMMON_OR_NULL 4. PTR_TO_TCP_SOCK_OR_NULL 5. PTR_TO_BTF_ID_OR_NULL 6. PTR_TO_MEM_OR_NULL 7. PTR_TO_RDONLY_BUF_OR_NULL 8. 
PTR_TO_RDWR_BUF_OR_NULL Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211217003152.48334-5-haoluo@google.com --- include/linux/bpf.h | 18 +++++++++--------- include/linux/bpf_verifier.h | 4 ++++ 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 975a1d5951bd..c3de62267b84 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -465,18 +465,15 @@ enum bpf_reg_type { PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ - PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + PTR_TO_MAP_KEY, /* reg points to a map element key */ PTR_TO_STACK, /* reg == frame_pointer + offset */ PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ - PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ - PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ - PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need @@ -494,18 +491,21 @@ enum bpf_reg_type { * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct. 
*/ - PTR_TO_BTF_ID_OR_NULL, PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ - PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ PTR_TO_FUNC, /* reg points to a bpf program function */ - PTR_TO_MAP_KEY, /* reg points to a map element key */ __BPF_REG_TYPE_MAX, + /* Extended reg_types. */ + PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, + PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, + PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, + PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, + PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, + PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 34e4ceaca3c7..143401d4c9d9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -18,6 +18,8 @@ * that converting umax_value to int cannot overflow. */ #define BPF_MAX_VAR_SIZ (1 << 29) +/* size of type_str_buf in bpf_verifier. */ +#define TYPE_STR_BUF_LEN 64 /* Liveness marks, used for registers and spilled-regs (in stack slots). 
* Read marks propagate upwards until they find a write mark; they record that @@ -484,6 +486,8 @@ struct bpf_verifier_env { /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; u32 prev_log_len, prev_insn_print_len; + /* buffer used in reg_type_str() to generate reg_type string */ + char type_str_buf[TYPE_STR_BUF_LEN]; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, -- cgit v1.2.3 From 20b2aff4bc15bda809f994761d5719827d66c0b4 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:48 -0800 Subject: bpf: Introduce MEM_RDONLY flag This patch introduces a flag MEM_RDONLY to tag a reg value pointing to read-only memory. It makes the following changes: 1. PTR_TO_RDWR_BUF -> PTR_TO_BUF 2. PTR_TO_RDONLY_BUF -> PTR_TO_BUF | MEM_RDONLY Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-6-haoluo@google.com --- include/linux/bpf.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c3de62267b84..126048110bdb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -311,7 +311,10 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = PTR_MAYBE_NULL, + /* MEM is read-only. */ + MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_RDONLY, }; /* Max number of base types. */ @@ -492,8 +495,7 @@ enum bpf_reg_type { * an explicit null check is required for this struct.
*/ PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ + PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ PTR_TO_FUNC, /* reg points to a bpf program function */ __BPF_REG_TYPE_MAX, -- cgit v1.2.3 From cf9f2f8d62eca810afbd1ee6cc0800202b000e57 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:49 -0800 Subject: bpf: Convert PTR_TO_MEM_OR_NULL to composable types. Remove PTR_TO_MEM_OR_NULL and replace it with PTR_TO_MEM combined with flag PTR_MAYBE_NULL. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-7-haoluo@google.com --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 126048110bdb..567d83bf28f9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -506,7 +506,6 @@ enum bpf_reg_type { PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, - PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. -- cgit v1.2.3 From 216e3cd2f28dbbf1fe86848e0e29e6693b9f0a20 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:51 -0800 Subject: bpf: Add MEM_RDONLY for helper args that are pointers to rdonly mem. Some helper functions may modify its arguments, for example, bpf_d_path, bpf_get_stack etc. Previously, their argument types were marked as ARG_PTR_TO_MEM, which is compatible with read-only mem types, such as PTR_TO_RDONLY_BUF. 
Therefore it's legitimate, but technically incorrect, to modify a read-only memory by passing it into one of such helper functions. This patch tags the bpf_args compatible with immutable memory with MEM_RDONLY flag. The arguments that don't have this flag will be only compatible with mutable memory types, preventing the helper from modifying a read-only memory. The bpf_args that have MEM_RDONLY are compatible with both mutable memory and immutable memory. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-9-haoluo@google.com --- include/linux/bpf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 567d83bf28f9..26753139d5b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -311,7 +311,9 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), - /* MEM is read-only. */ + /* MEM is read-only. When applied on bpf_arg, it indicates the arg is + * compatible with both mutable and immutable memory. + */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), __BPF_TYPE_LAST_FLAG = MEM_RDONLY, -- cgit v1.2.3 From 8cbfe939abe905280279e84a297b1cb34e0d0ec9 Mon Sep 17 00:00:00 2001 From: Baowen Zheng Date: Fri, 17 Dec 2021 19:16:22 +0100 Subject: flow_offload: allow user to offload tc action to net device Use flow_indr_dev_register/flow_indr_dev_setup_offload to offload tc action. We need to call tc_cleanup_flow_action to clean up tc action entry since in tc_setup_action, some actions may hold dev refcnt, especially the mirror action. Signed-off-by: Baowen Zheng Signed-off-by: Louis Peens Signed-off-by: Simon Horman Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a419718612c6..8b0bdeb4734e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -920,6 +920,7 @@ enum tc_setup_type { TC_SETUP_QDISC_TBF, TC_SETUP_QDISC_FIFO, TC_SETUP_QDISC_HTB, + TC_SETUP_ACT, }; /* These structures hold the attributes of bpf state that are being passed -- cgit v1.2.3 From 7d4203c13435c0bdae61bf16bbd0408d5b958ade Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 25 Nov 2021 18:15:37 +0100 Subject: mm: add virt_to_folio() and folio_address() These two wrappers around their respective struct page variants will be useful in the following patches. Signed-off-by: Vlastimil Babka Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin --- include/linux/mm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a7e4a9e7d807..4a6cf22483da 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -863,6 +863,13 @@ static inline struct page *virt_to_head_page(const void *x) return compound_head(page); } +static inline struct folio *virt_to_folio(const void *x) +{ + struct page *page = virt_to_page(x); + + return page_folio(page); +} + void __put_page(struct page *page); void put_pages_list(struct list_head *pages); @@ -1753,6 +1760,11 @@ void page_address_init(void); #define page_address_init() do { } while(0) #endif +static inline void *folio_address(const struct folio *folio) +{ + return page_address(&folio->page); +} + extern void *page_rmapping(struct page *page); extern struct anon_vma *page_anon_vma(struct page *page); extern pgoff_t __page_file_index(struct page *page); -- cgit v1.2.3 From d5c383f2c98ac58c210b266cdaf7b86bc32d1ad1 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 17 Dec 2021 15:30:56 +0000 Subject: iommu/iova: Squash entry_dtor abstraction All 
flush queues are driven by iommu-dma now, so there is no need to abstract entry_dtor or its data any more. Squash the now-canonical implementation directly into the IOVA code to get it out of the way. Reviewed-by: John Garry Reviewed-by: Christoph Hellwig Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/2260f8de00ab5e0f9d2a1cf8978e6ae7cd4f182c.1639753638.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iova.h | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iova.h b/include/linux/iova.h index 71d8a2de6635..e746d8e41449 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -40,9 +40,6 @@ struct iova_domain; /* Call-Back from IOVA code into IOMMU drivers */ typedef void (* iova_flush_cb)(struct iova_domain *domain); -/* Destructor for per-entry data */ -typedef void (* iova_entry_dtor)(unsigned long data); - /* Number of entries per Flush Queue */ #define IOVA_FQ_SIZE 256 @@ -53,7 +50,7 @@ typedef void (* iova_entry_dtor)(unsigned long data); struct iova_fq_entry { unsigned long iova_pfn; unsigned long pages; - unsigned long data; + struct page *freelist; u64 counter; /* Flush counter when this entrie was added */ }; @@ -88,9 +85,6 @@ struct iova_domain { iova_flush_cb flush_cb; /* Call-Back function to flush IOMMU TLBs */ - iova_entry_dtor entry_dtor; /* IOMMU driver specific destructor for - iova entry */ - struct timer_list fq_timer; /* Timer to regularily empty the flush-queues */ atomic_t fq_timer_on; /* 1 when timer is active, 0 @@ -146,15 +140,14 @@ void free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size); void queue_iova(struct iova_domain *iovad, unsigned long pfn, unsigned long pages, - unsigned long data); + struct page *freelist); unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool flush_rcache); struct iova *reserve_iova(struct iova_domain 
*iovad, unsigned long pfn_lo, unsigned long pfn_hi); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn); -int init_iova_flush_queue(struct iova_domain *iovad, - iova_flush_cb flush_cb, iova_entry_dtor entry_dtor); +int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); #else @@ -189,12 +182,6 @@ static inline void free_iova_fast(struct iova_domain *iovad, { } -static inline void queue_iova(struct iova_domain *iovad, - unsigned long pfn, unsigned long pages, - unsigned long data) -{ -} - static inline unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, @@ -216,13 +203,6 @@ static inline void init_iova_domain(struct iova_domain *iovad, { } -static inline int init_iova_flush_queue(struct iova_domain *iovad, - iova_flush_cb flush_cb, - iova_entry_dtor entry_dtor) -{ - return -ENODEV; -} - static inline struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn) { -- cgit v1.2.3 From 649ad9835a3783bcb6c69368fa939e0010abb2c6 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 17 Dec 2021 15:30:57 +0000 Subject: iommu/iova: Squash flush_cb abstraction Once again, with iommu-dma now being the only flush queue user, we no longer need the extra level of indirection through flush_cb. Squash that and let the flush queue code call the domain method directly. This does mean temporarily having to carry an additional copy of the IOMMU domain pointer around instead, but only until a later patch untangles it again. 
Reviewed-by: John Garry Reviewed-by: Christoph Hellwig Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/e3f9b4acdd6640012ef4fbc819ac868d727b64a9.1639753638.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iova.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iova.h b/include/linux/iova.h index e746d8e41449..99be4fcea4f3 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -14,6 +14,7 @@ #include #include #include +#include /* iova structure */ struct iova { @@ -35,11 +36,6 @@ struct iova_rcache { struct iova_cpu_rcache __percpu *cpu_rcaches; }; -struct iova_domain; - -/* Call-Back from IOVA code into IOMMU drivers */ -typedef void (* iova_flush_cb)(struct iova_domain *domain); - /* Number of entries per Flush Queue */ #define IOVA_FQ_SIZE 256 @@ -82,8 +78,7 @@ struct iova_domain { struct iova anchor; /* rbtree lookup anchor */ struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE]; /* IOVA range caches */ - iova_flush_cb flush_cb; /* Call-Back function to flush IOMMU - TLBs */ + struct iommu_domain *fq_domain; struct timer_list fq_timer; /* Timer to regularily empty the flush-queues */ @@ -147,7 +142,7 @@ struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn); -int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb); +int init_iova_flush_queue(struct iova_domain *iovad, struct iommu_domain *fq_domain); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); #else -- cgit v1.2.3 From 87f60cc65d24939353b40aa1d9297fea080cdf8d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Dec 2021 15:31:00 +0000 Subject: iommu/vt-d: Use put_pages_list page->freelist is for the use of slab. 
We already have the ability to free a list of pages in the core mm, but it requires the use of a list_head and for the pages to be chained together through page->lru. Switch the Intel IOMMU and IOVA code over to using free_pages_list(). Signed-off-by: Matthew Wilcox (Oracle) [rm: split from original patch, cosmetic tweaks, fix fq entries] Signed-off-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/2115b560d9a0ce7cd4b948bd51a2b7bde8fdfd59.1639753638.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 ++- include/linux/iova.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d2f3435e7d17..de0c57a567c8 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -186,7 +186,7 @@ struct iommu_iotlb_gather { unsigned long start; unsigned long end; size_t pgsize; - struct page *freelist; + struct list_head freelist; bool queued; }; @@ -399,6 +399,7 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) { *gather = (struct iommu_iotlb_gather) { .start = ULONG_MAX, + .freelist = LIST_HEAD_INIT(gather->freelist), }; } diff --git a/include/linux/iova.h b/include/linux/iova.h index 99be4fcea4f3..072a09c06e8a 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -46,7 +46,7 @@ struct iova_rcache { struct iova_fq_entry { unsigned long iova_pfn; unsigned long pages; - struct page *freelist; + struct list_head freelist; u64 counter; /* Flush counter when this entrie was added */ }; @@ -135,7 +135,7 @@ void free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size); void queue_iova(struct iova_domain *iovad, unsigned long pfn, unsigned long pages, - struct page *freelist); + struct list_head *freelist); unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool flush_rcache); struct iova *reserve_iova(struct iova_domain 
*iovad, unsigned long pfn_lo, -- cgit v1.2.3 From a17e3026bc4da9135ca9a42ec0b1fa67f95172e3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 17 Dec 2021 15:31:03 +0000 Subject: iommu: Move flush queue data into iommu_dma_cookie Complete the move into iommu-dma by refactoring the flush queues themselves to belong to the DMA cookie rather than the IOVA domain. The refactoring may as well extend to some minor cosmetic aspects too, to help us stay one step ahead of the style police. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/24304722005bc6f144e2a1fdd865d1465722fc2e.1639753638.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iova.h | 44 +------------------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iova.h b/include/linux/iova.h index 072a09c06e8a..0abd48c5e622 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -12,9 +12,6 @@ #include #include #include -#include -#include -#include /* iova structure */ struct iova { @@ -36,27 +33,6 @@ struct iova_rcache { struct iova_cpu_rcache __percpu *cpu_rcaches; }; -/* Number of entries per Flush Queue */ -#define IOVA_FQ_SIZE 256 - -/* Timeout (in ms) after which entries are flushed from the Flush-Queue */ -#define IOVA_FQ_TIMEOUT 10 - -/* Flush Queue entry for defered flushing */ -struct iova_fq_entry { - unsigned long iova_pfn; - unsigned long pages; - struct list_head freelist; - u64 counter; /* Flush counter when this entrie was added */ -}; - -/* Per-CPU Flush Queue structure */ -struct iova_fq { - struct iova_fq_entry entries[IOVA_FQ_SIZE]; - unsigned head, tail; - spinlock_t lock; -}; - /* holds all the iova translations for a domain */ struct iova_domain { spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ @@ -67,23 +43,9 @@ struct iova_domain { unsigned long start_pfn; /* Lower limit for this domain */ unsigned long dma_32bit_pfn; unsigned long max32_alloc_size; /* 
Size of last failed allocation */ - struct iova_fq __percpu *fq; /* Flush Queue */ - - atomic64_t fq_flush_start_cnt; /* Number of TLB flushes that - have been started */ - - atomic64_t fq_flush_finish_cnt; /* Number of TLB flushes that - have been finished */ - struct iova anchor; /* rbtree lookup anchor */ - struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE]; /* IOVA range caches */ - - struct iommu_domain *fq_domain; - struct timer_list fq_timer; /* Timer to regularily empty the - flush-queues */ - atomic_t fq_timer_on; /* 1 when timer is active, 0 - when not */ + struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE]; /* IOVA range caches */ struct hlist_node cpuhp_dead; }; @@ -133,16 +95,12 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, bool size_aligned); void free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size); -void queue_iova(struct iova_domain *iovad, - unsigned long pfn, unsigned long pages, - struct list_head *freelist); unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool flush_rcache); struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn); -int init_iova_flush_queue(struct iova_domain *iovad, struct iommu_domain *fq_domain); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); #else -- cgit v1.2.3 From 5bc9a9dd75351023793d8aa4116ead005d659729 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 19 Dec 2021 21:51:24 +0200 Subject: rfkill: allow to get the software rfkill state iwlwifi needs to be able to differentiate between the software rfkill state and the hardware rfkill state. 
The reason for this is that iwlwifi needs to notify any change in the software rfkill state even when it doesn't own the device (which means even when the hardware rfkill is asserted). In order to be able to know the software rfkill when the host does not own the device, iwlwifi needs to be able to ask the state of the software rfkill ignoring the state of the hardware rfkill. Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20211219195124.125689-1-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- include/linux/rfkill.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 231e06b74b50..c35f3962dc4f 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -229,6 +229,13 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw); */ bool rfkill_blocked(struct rfkill *rfkill); +/** + * rfkill_soft_blocked - Query soft rfkill block state + * + * @rfkill: rfkill struct to query + */ +bool rfkill_soft_blocked(struct rfkill *rfkill); + /** * rfkill_find_type - Helper for finding rfkill type by name * @name: the name of the type -- cgit v1.2.3 From aade40b62745cf0b4e8a17d43652c5faff354e6b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 20 Dec 2021 13:34:48 +0100 Subject: iommu/iova: Temporarily include dma-mapping.h from iova.h Some users of iova.h still expect that dma-mapping.h is also included. Re-add the include until these users are updated to fix compile failures in the iommu tree. 
Acked-by: Robin Murphy Link: https://lore.kernel.org/r/20211220123448.19996-1-joro@8bytes.org Signed-off-by: Joerg Roedel --- include/linux/iova.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iova.h b/include/linux/iova.h index 0abd48c5e622..cea79cb9f26c 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -12,6 +12,7 @@ #include #include #include +#include /* iova structure */ struct iova { -- cgit v1.2.3 From 59f37b7370ef56e6faf25d0e18bc597a0af40bb8 Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Sat, 4 Dec 2021 21:57:55 +0200 Subject: tty: serial: samsung: Remove USI initialization USI control is now extracted to the dedicated USI driver. Remove USI related code from serial driver to avoid conflicts and code duplication. Signed-off-by: Sam Protsenko Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20211204195757.8600-4-semen.protsenko@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_s3c.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h index cf0de4a86640..f6c3323fc4c5 100644 --- a/include/linux/serial_s3c.h +++ b/include/linux/serial_s3c.h @@ -27,15 +27,6 @@ #define S3C2410_UERSTAT (0x14) #define S3C2410_UFSTAT (0x18) #define S3C2410_UMSTAT (0x1C) -#define USI_CON (0xC4) -#define USI_OPTION (0xC8) - -#define USI_CON_RESET (1<<0) -#define USI_CON_RESET_MASK (1<<0) - -#define USI_OPTION_HWACG_CLKREQ_ON (1<<1) -#define USI_OPTION_HWACG_CLKSTOP_ON (1<<2) -#define USI_OPTION_HWACG_MASK (3<<1) #define S3C2410_LCON_CFGMASK ((0xF<<3)|(0x3)) -- cgit v1.2.3 From 1a5e91d8375fc8369207cc0b9894a324f2bbf1d9 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Mon, 13 Dec 2021 02:14:02 -0500 Subject: swiotlb: Add swiotlb bounce buffer remap function for HV IVM In Isolation VM with AMD SEV, bounce buffer needs to be accessed via extra address space which is above shared_gpa_boundary (E.G 39 
bit address line) reported by Hyper-V CPUID ISOLATION_CONFIG. The access physical address will be original physical address + shared_gpa_boundary. The shared_gpa_boundary in the AMD SEV SNP spec is called virtual top of memory(vTOM). Memory addresses below vTOM are automatically treated as private while memory above vTOM is treated as shared. Expose swiotlb_unencrypted_base for platforms to set unencrypted memory base offset and platform calls swiotlb_update_mem_attributes() to remap swiotlb mem to unencrypted address space. memremap() can not be called in the early stage and so put remapping code into swiotlb_update_mem_attributes(). Store remap address and use it to copy data from/to swiotlb bounce buffer. Signed-off-by: Tianyu Lan Acked-by: Christoph Hellwig Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20211213071407.314309-2-ltykernel@gmail.com Signed-off-by: Wei Liu --- include/linux/swiotlb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 569272871375..f6c3638255d5 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -73,6 +73,9 @@ extern enum swiotlb_force swiotlb_force; * @end: The end address of the swiotlb memory pool. Used to do a quick * range check to see if the memory was in fact allocated by this * API. + * @vaddr: The vaddr of the swiotlb memory pool. The swiotlb memory pool + * may be remapped in the memory encrypted case and store virtual + * address for bounce buffer operation. * @nslabs: The number of IO TLB blocks (in groups of 64) between @start and * @end. For default swiotlb, this is command line adjustable via * setup_io_tlb_npages. 
@@ -92,6 +95,7 @@ extern enum swiotlb_force swiotlb_force; struct io_tlb_mem { phys_addr_t start; phys_addr_t end; + void *vaddr; unsigned long nslabs; unsigned long used; unsigned int index; @@ -186,4 +190,6 @@ static inline bool is_swiotlb_for_alloc(struct device *dev) } #endif /* CONFIG_DMA_RESTRICTED_POOL */ +extern phys_addr_t swiotlb_unencrypted_base; + #endif /* __LINUX_SWIOTLB_H */ -- cgit v1.2.3 From 743b237c3a7b0f5b44aa704aae8a1058877b6322 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Mon, 13 Dec 2021 02:14:05 -0500 Subject: scsi: storvsc: Add Isolation VM support for storvsc driver In Isolation VM, all shared memory with host needs to mark visible to host via hvcall. vmbus_establish_gpadl() has already done it for storvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_ mpb_desc() still needs to be handled. Use DMA API(scsi_dma_map/unmap) to map these memory during sending/receiving packet and return swiotlb bounce buffer dma address. In Isolation VM, swiotlb bounce buffer is marked to be visible to host and the swiotlb force mode is enabled. Set device's dma min align mask to HV_HYP_PAGE_SIZE - 1 in order to keep the original data offset in the bounce buffer. 
Signed-off-by: Tianyu Lan Reviewed-by: Long Li Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20211213071407.314309-5-ltykernel@gmail.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b823311eac79..650a0574b746 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1261,6 +1261,7 @@ struct hv_device { struct vmbus_channel *channel; struct kset *channels_kset; + struct device_dma_parameters dma_parms; /* place holder to keep track of the dir for hv device in debugfs */ struct dentry *debug_dir; -- cgit v1.2.3 From 846da38de0e8224f2f94b885125cf1fd2d7b0d39 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Mon, 13 Dec 2021 02:14:06 -0500 Subject: net: netvsc: Add Isolation VM support for netvsc driver In Isolation VM, all shared memory with host needs to be marked visible to host via hvcall. vmbus_establish_gpadl() has already done it for netvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_pagebuffer() still needs to be handled. Use DMA API to map/unmap this memory during sending/receiving packet and Hyper-V swiotlb bounce buffer dma address will be returned. The swiotlb bounce buffer has been marked visible to host during boot up. rx/tx ring buffer is allocated via vzalloc() and they need to be mapped into unencrypted address space(above vTOM) before sharing with host and accessing. Add hv_map/unmap_memory() to map/unmap rx/tx ring buffer.
Signed-off-by: Tianyu Lan Reviewed-by: Haiyang Zhang Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20211213071407.314309-6-ltykernel@gmail.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 650a0574b746..f565a8938836 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1584,6 +1584,11 @@ struct hyperv_service_callback { void (*callback)(void *context); }; +struct hv_dma_range { + dma_addr_t dma; + u32 mapping_size; +}; + #define MAX_SRV_VER 0x7ffffff extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen, const int *fw_version, int fw_vercnt, -- cgit v1.2.3 From 6fc61c39ee1adb5f4115d288c876772fcd8b6979 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Sun, 21 Nov 2021 01:20:46 +0100 Subject: soc: qcom: llcc: Add configuration data for SM8350 Add LLCC configuration data for SM8350 SoC. Signed-off-by: Konrad Dybcio Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20211121002050.36977-2-konrad.dybcio@somainline.org --- include/linux/soc/qcom/llcc-qcom.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 437c9df13229..9e8fd92c96b7 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -33,6 +33,9 @@ #define LLCC_MODPE 29 #define LLCC_APTCM 30 #define LLCC_WRCACHE 31 +#define LLCC_CVPFW 32 +#define LLCC_CPUSS1 33 +#define LLCC_CPUHWT 36 /** * struct llcc_slice_desc - Cache slice descriptor -- cgit v1.2.3 From 7e5cced9ca84df52d874aca6b632f930b3dc5bc6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 20 Dec 2021 09:49:01 -0500 Subject: net: accept UFOv6 packages in virtio_net_hdr_to_skb Skb with skb->protocol 0 at the time of virtio_net_hdr_to_skb may have a protocol inferred from virtio_net_hdr with 
virtio_net_hdr_set_proto. Unlike TCP, UDP does not have separate types for IPv4 and IPv6. Type VIRTIO_NET_HDR_GSO_UDP is guessed to be IPv4/UDP. As of the below commit, UFOv6 packets are dropped due to not matching the protocol as obtained from dev_parse_header_protocol. Invert the test to take that L2 protocol field as starting point and pass both UFOv4 and UFOv6 for VIRTIO_NET_HDR_GSO_UDP. Fixes: 924a9bc362a5 ("net: check if protocol extracted by virtio_net_hdr_set_proto is correct") Link: https://lore.kernel.org/netdev/CABcq3pG9GRCYqFDBAJ48H1vpnnX=41u+MhQnayF1ztLH4WX0Fw@mail.gmail.com/ Reported-by: Andrew Melnichenko Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20211220144901.2784030-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 04e87f4b9417..22dd48c82560 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -7,6 +7,21 @@ #include #include +static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) +{ + switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + return protocol == cpu_to_be16(ETH_P_IP); + case VIRTIO_NET_HDR_GSO_TCPV6: + return protocol == cpu_to_be16(ETH_P_IPV6); + case VIRTIO_NET_HDR_GSO_UDP: + return protocol == cpu_to_be16(ETH_P_IP) || + protocol == cpu_to_be16(ETH_P_IPV6); + default: + return false; + } +} + static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, const struct virtio_net_hdr *hdr) { @@ -88,9 +103,12 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb->protocol) { __be16 protocol = dev_parse_header_protocol(skb); - virtio_net_hdr_set_proto(skb, hdr); - if (protocol && protocol != skb->protocol) + if (!protocol) + virtio_net_hdr_set_proto(skb, hdr); + else if 
(!virtio_net_hdr_match_proto(protocol, hdr->gso_type)) return -EINVAL; + else + skb->protocol = protocol; } retry: if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, -- cgit v1.2.3 From 1ed1d592113959f00cc552c3b9f47ca2d157768f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 20 Dec 2021 09:50:27 -0500 Subject: net: skip virtio_net_hdr_set_proto if protocol already set virtio_net_hdr_set_proto infers skb->protocol from the virtio_net_hdr gso_type, to avoid packets getting dropped for lack of a proto type. Its protocol choice is a guess, especially in the case of UFO, where the single VIRTIO_NET_HDR_GSO_UDP label covers both UFOv4 and UFOv6. Skip this best effort if the field is already initialized. Whether explicitly from userspace, or implicitly based on an earlier call to dev_parse_header_protocol (which is more robust, but was introduced after this patch). Fixes: 9d2f67e43b73 ("net/packet: fix packet drop as of virtio gso") Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20211220145027.2784293-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 22dd48c82560..a960de68ac69 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -25,6 +25,9 @@ static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, const struct virtio_net_hdr *hdr) { + if (skb->protocol) + return 0; + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_UDP: -- cgit v1.2.3 From 80a5ca99c5c04be6777df225ab932142a9d60c3f Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 16 Dec 2021 11:33:00 +0800 Subject: rapidio: remove not used macro definition in rio_ids.h The definition of RIO_VID_FREESCALE, RIO_DID_MPC8560, RIO_DID_TSI500, 
RIO_DID_TSI576 and RIO_DID_TSI721 are not used for many years in the current code, so just remove them. Signed-off-by: Tiezhu Yang Link: https://lore.kernel.org/r/1639625581-22867-2-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Greg Kroah-Hartman --- include/linux/rio_ids.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rio_ids.h b/include/linux/rio_ids.h index 4846f72759b2..e74d8840708a 100644 --- a/include/linux/rio_ids.h +++ b/include/linux/rio_ids.h @@ -9,15 +9,10 @@ #ifndef LINUX_RIO_IDS_H #define LINUX_RIO_IDS_H -#define RIO_VID_FREESCALE 0x0002 -#define RIO_DID_MPC8560 0x0003 - #define RIO_VID_TUNDRA 0x000d -#define RIO_DID_TSI500 0x0500 #define RIO_DID_TSI568 0x0568 #define RIO_DID_TSI572 0x0572 #define RIO_DID_TSI574 0x0574 -#define RIO_DID_TSI576 0x0578 /* Same ID as Tsi578 */ #define RIO_DID_TSI577 0x0577 #define RIO_DID_TSI578 0x0578 @@ -33,7 +28,6 @@ #define RIO_DID_IDTCPS1616 0x0379 #define RIO_DID_IDTVPS1616 0x0377 #define RIO_DID_IDTSPS1616 0x0378 -#define RIO_DID_TSI721 0x80ab #define RIO_DID_IDTRXS1632 0x80e5 #define RIO_DID_IDTRXS2448 0x80e6 -- cgit v1.2.3 From 612d4904191ff9aca01b1e087d8687b3a223cb33 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 16 Dec 2021 11:33:01 +0800 Subject: rapidio: remove not used code about RIO_VID_TUNDRA According to https://rapidio.org/vendor-id/, there is no 0x000d vendor id in the complete and current list of VendorIDs, it means that the related code is dead code now, so just remove them. 
Signed-off-by: Tiezhu Yang Link: https://lore.kernel.org/r/1639625581-22867-3-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Greg Kroah-Hartman --- include/linux/rio_ids.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rio_ids.h b/include/linux/rio_ids.h index e74d8840708a..c7e2f21dd5c1 100644 --- a/include/linux/rio_ids.h +++ b/include/linux/rio_ids.h @@ -9,13 +9,6 @@ #ifndef LINUX_RIO_IDS_H #define LINUX_RIO_IDS_H -#define RIO_VID_TUNDRA 0x000d -#define RIO_DID_TSI568 0x0568 -#define RIO_DID_TSI572 0x0572 -#define RIO_DID_TSI574 0x0574 -#define RIO_DID_TSI577 0x0577 -#define RIO_DID_TSI578 0x0578 - #define RIO_VID_IDT 0x0038 #define RIO_DID_IDT70K200 0x0310 #define RIO_DID_IDTCPS8 0x035c -- cgit v1.2.3 From 0032ca576a79946492194ae4860b462d32815c66 Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Tue, 21 Dec 2021 17:16:46 +0900 Subject: counter: Add the necessary colons and indents to the comments of counter_compi Commit aaec1a0f76ec ("counter: Internalize sysfs interface code") introduced the following warnings: linux-next/Documentation/driver-api/generic-counter:234: ./include/linux/counter.h:43: WARNING: Unexpected indentation. linux-next/Documentation/driver-api/generic-counter:234: ./include/linux/counter.h:45: WARNING: Block quote ends without a blank line; unexpected unindent. Add the necessary colons and indents.
Fixes: aaec1a0f76ec ("counter: Internalize sysfs interface code") Signed-off-by: Yanteng Si Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/26011e814d6eca02c7ebdbb92f171a49928a7e89.1640072891.git.vilhelm.gray@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index b7d0a00a61cf..dfbde2808998 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -38,64 +38,64 @@ enum counter_comp_type { * @type: Counter component data type * @name: device-specific component name * @priv: component-relevant data - * @action_read Synapse action mode read callback. The read value of the + * @action_read: Synapse action mode read callback. The read value of the * respective Synapse action mode should be passed back via * the action parameter. - * @device_u8_read Device u8 component read callback. The read value of the + * @device_u8_read: Device u8 component read callback. The read value of the * respective Device u8 component should be passed back via * the val parameter. - * @count_u8_read Count u8 component read callback. The read value of the + * @count_u8_read: Count u8 component read callback. The read value of the * respective Count u8 component should be passed back via * the val parameter. - * @signal_u8_read Signal u8 component read callback. The read value of the + * @signal_u8_read: Signal u8 component read callback. The read value of the * respective Signal u8 component should be passed back via * the val parameter. - * @device_u32_read Device u32 component read callback. The read value of + * @device_u32_read: Device u32 component read callback. The read value of * the respective Device u32 component should be passed * back via the val parameter. - * @count_u32_read Count u32 component read callback. 
The read value of the + * @count_u32_read: Count u32 component read callback. The read value of the * respective Count u32 component should be passed back via * the val parameter. - * @signal_u32_read Signal u32 component read callback. The read value of + * @signal_u32_read: Signal u32 component read callback. The read value of * the respective Signal u32 component should be passed * back via the val parameter. - * @device_u64_read Device u64 component read callback. The read value of + * @device_u64_read: Device u64 component read callback. The read value of * the respective Device u64 component should be passed * back via the val parameter. - * @count_u64_read Count u64 component read callback. The read value of the + * @count_u64_read: Count u64 component read callback. The read value of the * respective Count u64 component should be passed back via * the val parameter. - * @signal_u64_read Signal u64 component read callback. The read value of + * @signal_u64_read: Signal u64 component read callback. The read value of * the respective Signal u64 component should be passed * back via the val parameter. - * @action_write Synapse action mode write callback. The write value of + * @action_write: Synapse action mode write callback. The write value of * the respective Synapse action mode is passed via the * action parameter. - * @device_u8_write Device u8 component write callback. The write value of + * @device_u8_write: Device u8 component write callback. The write value of * the respective Device u8 component is passed via the val * parameter. - * @count_u8_write Count u8 component write callback. The write value of + * @count_u8_write: Count u8 component write callback. The write value of * the respective Count u8 component is passed via the val * parameter. - * @signal_u8_write Signal u8 component write callback. The write value of + * @signal_u8_write: Signal u8 component write callback. 
The write value of * the respective Signal u8 component is passed via the val * parameter. - * @device_u32_write Device u32 component write callback. The write value of + * @device_u32_write: Device u32 component write callback. The write value of * the respective Device u32 component is passed via the * val parameter. - * @count_u32_write Count u32 component write callback. The write value of + * @count_u32_write: Count u32 component write callback. The write value of * the respective Count u32 component is passed via the val * parameter. - * @signal_u32_write Signal u32 component write callback. The write value of + * @signal_u32_write: Signal u32 component write callback. The write value of * the respective Signal u32 component is passed via the * val parameter. - * @device_u64_write Device u64 component write callback. The write value of + * @device_u64_write: Device u64 component write callback. The write value of * the respective Device u64 component is passed via the * val parameter. - * @count_u64_write Count u64 component write callback. The write value of + * @count_u64_write: Count u64 component write callback. The write value of * the respective Count u64 component is passed via the val * parameter. - * @signal_u64_write Signal u64 component write callback. The write value of + * @signal_u64_write: Signal u64 component write callback. The write value of * the respective Signal u64 component is passed via the * val parameter. */ -- cgit v1.2.3 From 79f1c7304295bbbc611bc53cfd5425b777b3e840 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 9 Dec 2021 14:30:08 +0200 Subject: kernfs: Replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when there are circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. 
Acked-by: Tejun Heo Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211209123008.3391-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 9f650986a81b..861c4f0f8a29 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -6,7 +6,6 @@ #ifndef __LINUX_KERNFS_H #define __LINUX_KERNFS_H -#include #include #include #include @@ -14,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,6 +24,7 @@ struct dentry; struct iattr; struct seq_file; struct vm_area_struct; +struct vm_operations_struct; struct super_block; struct file_system_type; struct poll_table_struct; -- cgit v1.2.3 From eca6e2d4a4a4b824f055eeaaa24f1c2327fb91a2 Mon Sep 17 00:00:00 2001 From: Anand Ashok Dumbre Date: Fri, 3 Dec 2021 21:23:54 +0000 Subject: device property: Add fwnode_iomap() This patch introduces a new helper routine - fwnode_iomap(), which allows to map the memory mapped IO for a given device node. This implementation does not cover the ACPI case and may be expanded in the future. The main purpose here is to be able to develop resource provider agnostic drivers. Suggested-by: Andy Shevchenko Signed-off-by: Anand Ashok Dumbre Reviewed-by: Andy Shevchenko Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20211203212358.31444-2-anand.ashok.dumbre@xilinx.com Signed-off-by: Jonathan Cameron --- include/linux/property.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 88fa726a76df..6670d5a1ec2a 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -122,6 +122,8 @@ void fwnode_handle_put(struct fwnode_handle *fwnode); int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index); +void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index); + unsigned int device_get_child_node_count(struct device *dev); static inline bool device_property_read_bool(struct device *dev, -- cgit v1.2.3 From 1b0b6cc8030d08d2a24e9e5f85dc36c5a58200ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 24 Nov 2021 00:27:01 +0100 Subject: power: supply: add charge_behaviour attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a revised version of "[RFC] add standardized attributes for force_discharge and inhibit_charge" [0], incorporating discussion results. The biggest change is the switch from two boolean attributes to a single enum attribute. [0] https://lore.kernel.org/platform-driver-x86/21569a89-8303-8573-05fb-c2fec29983d1@gmail.com/ Signed-off-by: Thomas Weißschuh Acked-by: Sebastian Reichel Link: https://lore.kernel.org/r/20211123232704.25394-2-linux@weissschuh.net Signed-off-by: Hans de Goede --- include/linux/power_supply.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 9ca1f120a211..70c333e86293 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -132,6 +132,7 @@ enum power_supply_property { POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX, POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD, /* in percents!
*/ POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD, /* in percents! */ + POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR, POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT, POWER_SUPPLY_PROP_INPUT_VOLTAGE_LIMIT, POWER_SUPPLY_PROP_INPUT_POWER_LIMIT, @@ -202,6 +203,12 @@ enum power_supply_usb_type { POWER_SUPPLY_USB_TYPE_APPLE_BRICK_ID, /* Apple Charging Method */ }; +enum power_supply_charge_behaviour { + POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO = 0, + POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE, + POWER_SUPPLY_CHARGE_BEHAVIOUR_FORCE_DISCHARGE, +}; + enum power_supply_notifier_events { PSY_EVENT_PROP_CHANGED, }; -- cgit v1.2.3 From 539b9c94ac83563842a27e8cc3de5164b15c4de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 24 Nov 2021 00:27:02 +0100 Subject: power: supply: add helpers for charge_behaviour sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These helper functions can be used by drivers to implement their own sysfs-attributes. This is useful for ACPI-drivers extending the default ACPI-battery with their own charge_behaviour attributes. 
Signed-off-by: Thomas Weißschuh Acked-by: Sebastian Reichel Link: https://lore.kernel.org/r/20211123232704.25394-3-linux@weissschuh.net Signed-off-by: Hans de Goede --- include/linux/power_supply.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 70c333e86293..71f0379c2af8 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -546,4 +546,13 @@ static inline void power_supply_remove_hwmon_sysfs(struct power_supply *psy) {} #endif +#ifdef CONFIG_SYSFS +ssize_t power_supply_charge_behaviour_show(struct device *dev, + unsigned int available_behaviours, + enum power_supply_charge_behaviour behaviour, + char *buf); + +int power_supply_charge_behaviour_parse(unsigned int available_behaviours, const char *buf); +#endif + #endif /* __LINUX_POWER_SUPPLY_H__ */ -- cgit v1.2.3 From 37ae5a0f5287a52cf51242e76ccf198d02ffe495 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 18 Dec 2021 18:41:56 +0900 Subject: block: use "unsigned long" for blk_validate_block_size(). Since lo_simple_ioctl(LOOP_SET_BLOCK_SIZE) and ioctl(NBD_SET_BLKSIZE) pass user-controlled "unsigned long arg" to blk_validate_block_size(), "unsigned long" should be used for validation. 
Signed-off-by: Tetsuo Handa Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/9ecbf057-4375-c2db-ab53-e4cc0dff953d@i-love.sakura.ne.jp Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c80cfaefc0a8..bb5fb7282e6e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -45,7 +45,7 @@ struct blk_crypto_profile; */ #define BLKCG_MAX_POLS 6 -static inline int blk_validate_block_size(unsigned int bsize) +static inline int blk_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) return -EINVAL; -- cgit v1.2.3 From dcce50e6cc4d86a63dc0a9a6ee7d4f948ccd53a1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 8 Nov 2021 14:35:59 -0800 Subject: compiler.h: Fix annotation macro misplacement with Clang When building with Clang and CONFIG_TRACE_BRANCH_PROFILING, there are a lot of unreachable warnings, like: arch/x86/kernel/traps.o: warning: objtool: handle_xfd_event()+0x134: unreachable instruction Without an input to the inline asm, 'volatile' is ignored for some reason and Clang feels free to move the reachable() annotation away from its intended location. Fix that by re-adding the counter value to the inputs. 
Fixes: f1069a8756b9 ("compiler.h: Avoid using inline asm operand modifiers") Fixes: c199f64ff93c ("instrumentation.h: Avoid using inline asm operand modifiers") Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/0417e96909b97a406323409210de7bf13df0b170.1636410380.git.jpoimboe@redhat.com Cc: Peter Zijlstra Cc: x86@kernel.org Cc: Vasily Gorbik Cc: Miroslav Benes --- include/linux/compiler.h | 4 ++-- include/linux/instrumentation.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 3d5af56337bd..429dcebe2b99 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -121,7 +121,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.reachable\n\t" \ ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t"); \ + ".popsection\n\t" : : "i" (c)); \ }) #define annotate_reachable() __annotate_reachable(__COUNTER__) @@ -129,7 +129,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.unreachable\n\t" \ ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t"); \ + ".popsection\n\t" : : "i" (c)); \ }) #define annotate_unreachable() __annotate_unreachable(__COUNTER__) diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h index fa2cd8c63dcc..24359b4a9605 100644 --- a/include/linux/instrumentation.h +++ b/include/linux/instrumentation.h @@ -11,7 +11,7 @@ asm volatile(__stringify(c) ": nop\n\t" \ ".pushsection .discard.instr_begin\n\t" \ ".long " __stringify(c) "b - .\n\t" \ - ".popsection\n\t"); \ + ".popsection\n\t" : : "i" (c)); \ }) #define instrumentation_begin() __instrumentation_begin(__COUNTER__) @@ -50,7 +50,7 @@ asm volatile(__stringify(c) ": nop\n\t" \ ".pushsection .discard.instr_end\n\t" \ ".long " 
__stringify(c) "b - .\n\t" \ - ".popsection\n\t"); \ + ".popsection\n\t" : : "i" (c)); \ }) #define instrumentation_end() __instrumentation_end(__COUNTER__) #else -- cgit v1.2.3 From f857acfc457ea63fa5b862d77f055665d863acfe Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 17 Nov 2021 14:53:48 -0700 Subject: lib/scatterlist: cleanup macros into static inline functions Convert the sg_is_chain(), sg_is_last() and sg_chain_ptr() macros into static inline functions. There's no reason for these to be macros and static inline are generally preferred these days. Also introduce the SG_PAGE_LINK_MASK define so the P2PDMA work, which is adding another bit to this mask, can do so more easily. Suggested-by: Jason Gunthorpe Signed-off-by: Logan Gunthorpe Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- include/linux/scatterlist.h | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 266754a55327..7ff9d6386c12 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -69,10 +69,27 @@ struct sg_append_table { * a valid sg entry, or whether it points to the start of a new scatterlist. * Those low bits are there for everyone! 
(thanks mason :-) */ -#define sg_is_chain(sg) ((sg)->page_link & SG_CHAIN) -#define sg_is_last(sg) ((sg)->page_link & SG_END) -#define sg_chain_ptr(sg) \ - ((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END))) +#define SG_PAGE_LINK_MASK (SG_CHAIN | SG_END) + +static inline unsigned int __sg_flags(struct scatterlist *sg) +{ + return sg->page_link & SG_PAGE_LINK_MASK; +} + +static inline struct scatterlist *sg_chain_ptr(struct scatterlist *sg) +{ + return (struct scatterlist *)(sg->page_link & ~SG_PAGE_LINK_MASK); +} + +static inline bool sg_is_chain(struct scatterlist *sg) +{ + return __sg_flags(sg) & SG_CHAIN; +} + +static inline bool sg_is_last(struct scatterlist *sg) +{ + return __sg_flags(sg) & SG_END; +} /** * sg_assign_page - Assign a given page to an SG entry @@ -92,7 +109,7 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) * In order for the low bit stealing approach to work, pages * must be aligned at a 32-bit boundary as a minimum. */ - BUG_ON((unsigned long) page & (SG_CHAIN | SG_END)); + BUG_ON((unsigned long)page & SG_PAGE_LINK_MASK); #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif @@ -126,7 +143,7 @@ static inline struct page *sg_page(struct scatterlist *sg) #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif - return (struct page *)((sg)->page_link & ~(SG_CHAIN | SG_END)); + return (struct page *)((sg)->page_link & ~SG_PAGE_LINK_MASK); } /** -- cgit v1.2.3 From 365481e42a8a95c55e43e8cc236138718e762e7b Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Tue, 7 Dec 2021 17:50:11 -0800 Subject: driver core: auxiliary bus: Add driver data helpers Adds get/set driver data helpers for auxiliary devices. Reviewed-by: Mark Gross Reviewed-by: Andy Shevchenko Signed-off-by: David E. 
Box Link: https://lore.kernel.org/r/20211208015015.891275-3-david.e.box@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/auxiliary_bus.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index e6d8b5c16226..de21d9d24a95 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -188,6 +188,16 @@ struct auxiliary_driver { const struct auxiliary_device_id *id_table; }; +static inline void *auxiliary_get_drvdata(struct auxiliary_device *auxdev) +{ + return dev_get_drvdata(&auxdev->dev); +} + +static inline void auxiliary_set_drvdata(struct auxiliary_device *auxdev, void *data) +{ + dev_set_drvdata(&auxdev->dev, data); +} + static inline struct auxiliary_device *to_auxiliary_dev(struct device *dev) { return container_of(dev, struct auxiliary_device, dev); -- cgit v1.2.3 From b398123bff3bcbc1facb0f29bf6e7b9f1bc55931 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Wed, 15 Dec 2021 10:13:48 +0800 Subject: efi: apply memblock cap after memblock_add() On arm64, during kdump kernel saves vmcore, it runs into the following bug: ... [ 15.148919] usercopy: Kernel memory exposure attempt detected from SLUB object 'kmem_cache_node' (offset 0, size 4096)! [ 15.159707] ------------[ cut here ]------------ [ 15.164311] kernel BUG at mm/usercopy.c:99! 
[ 15.168482] Internal error: Oops - BUG: 0 [#1] SMP [ 15.173261] Modules linked in: xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 sha1_ce sbsa_gwdt ast i2c_algo_bit drm_vram_helper drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops cec drm_ttm_helper ttm drm nvme nvme_core xgene_hwmon i2c_designware_platform i2c_designware_core dm_mirror dm_region_hash dm_log dm_mod overlay squashfs zstd_decompress loop [ 15.206186] CPU: 0 PID: 542 Comm: cp Not tainted 5.16.0-rc4 #1 [ 15.212006] Hardware name: GIGABYTE R272-P30-JG/MP32-AR0-JG, BIOS F12 (SCP: 1.5.20210426) 05/13/2021 [ 15.221125] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 15.228073] pc : usercopy_abort+0x9c/0xa0 [ 15.232074] lr : usercopy_abort+0x9c/0xa0 [ 15.236070] sp : ffff8000121abba0 [ 15.239371] x29: ffff8000121abbb0 x28: 0000000000003000 x27: 0000000000000000 [ 15.246494] x26: 0000000080000400 x25: 0000ffff885c7000 x24: 0000000000000000 [ 15.253617] x23: 000007ff80400000 x22: ffff07ff80401000 x21: 0000000000000001 [ 15.260739] x20: 0000000000001000 x19: ffff07ff80400000 x18: ffffffffffffffff [ 15.267861] x17: 656a626f2042554c x16: 53206d6f72662064 x15: 6574636574656420 [ 15.274983] x14: 74706d6574746120 x13: 2129363930342065 x12: 7a6973202c302074 [ 15.282105] x11: ffffc8b041d1b148 x10: 00000000ffff8000 x9 : ffffc8b04012812c [ 15.289228] x8 : 00000000ffff7fff x7 : ffffc8b041d1b148 x6 : 0000000000000000 [ 15.296349] x5 : 0000000000000000 x4 : 0000000000007fff x3 : 0000000000000000 [ 15.303471] x2 : 0000000000000000 x1 : ffff07ff8c064800 x0 : 000000000000006b [ 15.310593] Call trace: [ 15.313027] usercopy_abort+0x9c/0xa0 [ 15.316677] __check_heap_object+0xd4/0xf0 [ 15.320762] __check_object_size.part.0+0x160/0x1e0 [ 15.325628] __check_object_size+0x2c/0x40 [ 15.329711] copy_oldmem_page+0x7c/0x140 [ 15.333623] read_from_oldmem.part.0+0xfc/0x1c0 [ 15.338142] __read_vmcore.constprop.0+0x23c/0x350 [ 15.342920] read_vmcore+0x28/0x34 [ 15.346309] proc_reg_read+0xb4/0xf0 [ 
15.349871] vfs_read+0xb8/0x1f0 [ 15.353088] ksys_read+0x74/0x100 [ 15.356390] __arm64_sys_read+0x28/0x34 ... This bug introduced by commit b261dba2fdb2 ("arm64: kdump: Remove custom linux,usable-memory-range handling"), which moves memblock_cap_memory_range() to fdt, but it breaches the rules that memblock_cap_memory_range() should come after memblock_add() etc as said in commit e888fa7bb882 ("memblock: Check memory add/cap ordering"). As a consequence, the virtual address set up by copy_oldmem_page() does not bail out from the test of virt_addr_valid() in check_heap_object(), and finally hits the BUG_ON(). Since memblock allocator has no idea about when the memblock is fully populated, while efi_init() is aware, so tackling this issue by calling the interface early_init_dt_check_for_usable_mem_range() exposed by of/fdt. Fixes: b261dba2fdb2 ("arm64: kdump: Remove custom linux,usable-memory-range handling") Signed-off-by: Pingfan Liu Cc: Rob Herring Cc: Zhen Lei Cc: Catalin Marinas Cc: Will Deacon Cc: Andrew Morton Cc: Mike Rapoport Cc: Geert Uytterhoeven Cc: Frank Rowand Cc: Ard Biesheuvel Cc: Nick Terrell Cc: linux-arm-kernel@lists.infradead.org To: devicetree@vger.kernel.org To: linux-efi@vger.kernel.org Acked-by: Ard Biesheuvel Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20211215021348.8766-1-kernelfans@gmail.com --- include/linux/of_fdt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index cf48983d3c86..ad09beb6d13c 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -62,6 +62,7 @@ extern int early_init_dt_scan_chosen(unsigned long node, const char *uname, int depth, void *data); extern int early_init_dt_scan_memory(unsigned long node, const char *uname, int depth, void *data); +extern void early_init_dt_check_for_usable_mem_range(void); extern int early_init_dt_scan_chosen_stdout(void); extern void early_init_fdt_scan_reserved_mem(void); extern 
void early_init_fdt_reserve_self(void); @@ -86,6 +87,7 @@ extern void unflatten_and_copy_device_tree(void); extern void early_init_devtree(void *); extern void early_get_first_memblock_info(void *, phys_addr_t *); #else /* CONFIG_OF_EARLY_FLATTREE */ +static inline void early_init_dt_check_for_usable_mem_range(void) {} static inline int early_init_dt_scan_chosen_stdout(void) { return -ENODEV; } static inline void early_init_fdt_scan_reserved_mem(void) {} static inline void early_init_fdt_reserve_self(void) {} -- cgit v1.2.3 From f2f8115fe8b390af27d013411045bd712a812103 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Tue, 21 Dec 2021 15:17:56 +0200 Subject: memory: omap-gpmc: Use a compatible match table when checking for NAND controller As more compatibles can be added to the GPMC NAND controller driver use a compatible match table. Signed-off-by: Roger Quadros Acked-by: Miquel Raynal Link: https://lore.kernel.org/r/20211221131757.2030-4-rogerq@kernel.org [krzysztof: remove "is_nand" variable] Signed-off-by: Krzysztof Kozlowski --- include/linux/platform_data/mtd-nand-omap2.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/mtd-nand-omap2.h b/include/linux/platform_data/mtd-nand-omap2.h index de6ada739121..92f011805ad4 100644 --- a/include/linux/platform_data/mtd-nand-omap2.h +++ b/include/linux/platform_data/mtd-nand-omap2.h @@ -7,6 +7,7 @@ #define _MTD_NAND_OMAP2_H #include +#include #define GPMC_BCH_NUM_REMAINDER 8 @@ -61,4 +62,10 @@ struct gpmc_nand_regs { void __iomem *gpmc_bch_result5[GPMC_BCH_NUM_REMAINDER]; void __iomem *gpmc_bch_result6[GPMC_BCH_NUM_REMAINDER]; }; -#endif + +static const struct of_device_id omap_nand_ids[] = { + { .compatible = "ti,omap2-nand", }, + {}, +}; + +#endif /* _MTD_NAND_OMAP2_H */ -- cgit v1.2.3 From d7f55471db2719629f773c2d6b5742a69595bfd3 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Fri, 17 Dec 2021 10:07:54 +0800 Subject: memblock: fix 
memblock_phys_alloc() section mismatch error Fix modpost Section mismatch error in memblock_phys_alloc() [...] WARNING: modpost: vmlinux.o(.text.unlikely+0x1dcc): Section mismatch in reference from the function memblock_phys_alloc() to the function .init.text:memblock_phys_alloc_range() The function memblock_phys_alloc() references the function __init memblock_phys_alloc_range(). This is often because memblock_phys_alloc lacks a __init annotation or the annotation of memblock_phys_alloc_range is wrong. ERROR: modpost: Section mismatches detected. Set CONFIG_SECTION_MISMATCH_WARN_ONLY=y to allow them. [...] memblock_phys_alloc() is a one-line wrapper, make it __always_inline to avoid these section mismatches. Reported-by: k2ci Suggested-by: Mike Rapoport Signed-off-by: Jackie Liu [rppt: slightly massaged changelog ] Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/20211217020754.2874872-1-liu.yun@linux.dev --- include/linux/memblock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8adcf1fa8096..9dc7cb239d21 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -405,8 +405,8 @@ phys_addr_t memblock_alloc_range_nid(phys_addr_t size, phys_addr_t end, int nid, bool exact_nid); phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid); -static inline phys_addr_t memblock_phys_alloc(phys_addr_t size, - phys_addr_t align) +static __always_inline phys_addr_t memblock_phys_alloc(phys_addr_t size, + phys_addr_t align) { return memblock_phys_alloc_range(size, align, 0, MEMBLOCK_ALLOC_ACCESSIBLE); -- cgit v1.2.3 From 6fd3c510ee4b37f2f9fe3d3cafbfa459e15c5e11 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 22 Dec 2021 13:15:32 -0800 Subject: bio.h: fix kernel-doc warnings Fix all kernel-doc warnings in : include/linux/bio.h:136: warning: Function parameter or member 'nbytes' not described in 'bio_advance' 
include/linux/bio.h:136: warning: Excess function parameter 'bytes' description in 'bio_advance' include/linux/bio.h:391: warning: No description found for return value of 'bio_next_split' Signed-off-by: Randy Dunlap Cc: Kent Overstreet Cc: Jens Axboe Cc: linux-block@vger.kernel.org Link: https://lore.kernel.org/r/20211222211532.24060-1-rdunlap@infradead.org Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index fe6bdfbbef66..0a41efe02208 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -124,7 +124,7 @@ void __bio_advance(struct bio *, unsigned bytes); /** * bio_advance - increment/complete a bio by some number of bytes * @bio: bio to advance - * @bytes: number of bytes to complete + * @nbytes: number of bytes to complete * * This updates bi_sector, bi_size and bi_idx; if the number of bytes to * complete doesn't align with a bvec boundary, then bv_len and bv_offset will @@ -332,7 +332,7 @@ extern struct bio *bio_split(struct bio *bio, int sectors, * @gfp: gfp mask * @bs: bio set to allocate from * - * Returns a bio representing the next @sectors of @bio - if the bio is smaller + * Return: a bio representing the next @sectors of @bio - if the bio is smaller * than @sectors, returns the original bio unchanged. */ static inline struct bio *bio_next_split(struct bio *bio, int sectors, -- cgit v1.2.3 From a16c7246368db8935652c805bc446928d0e1c0aa Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 22 Dec 2021 13:52:39 -0800 Subject: block: remove unnecessary trailing '\' While harmless, the blank line is certainly not intended to be part of the rq_list_for_each() macro. Remove it. 
Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20211222215239.1768164-1-kbusch@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bb5fb7282e6e..22746b2d6825 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1363,7 +1363,7 @@ struct io_comp_batch { }) #define rq_list_for_each(listptr, pos) \ - for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) \ + for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) #define rq_list_next(rq) (rq)->rq_next #define rq_list_empty(list) ((list) == (struct request *) NULL) -- cgit v1.2.3 From 6e1fcab00a23f7fe9f4fe9704905a790efa1eeab Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 20 Dec 2021 19:21:26 +0800 Subject: scsi: block: pm: Always set request queue runtime active in blk_post_runtime_resume() John Garry reported a deadlock that occurs when trying to access a runtime-suspended SATA device. For obscure reasons, the rescan procedure causes the link to be hard-reset, which disconnects the device. The rescan tries to carry out a runtime resume when accessing the device. scsi_rescan_device() holds the SCSI device lock and won't release it until it can put commands onto the device's block queue. This can't happen until the queue is successfully runtime-resumed or the device is unregistered. But the runtime resume fails because the device is disconnected, and __scsi_remove_device() can't do the unregistration because it can't get the device lock. The best way to resolve this deadlock appears to be to allow the block queue to start running again even after an unsuccessful runtime resume. The idea is that the driver or the SCSI error handler will need to be able to use the queue to resolve the runtime resume failure. 
This patch removes the err argument to blk_post_runtime_resume() and makes the routine act as though the resume was successful always. This fixes the deadlock. Link: https://lore.kernel.org/r/1639999298-244569-4-git-send-email-chenxiang66@hisilicon.com Fixes: e27829dc92e5 ("scsi: serialize ->rescan against ->remove") Reported-and-tested-by: John Garry Reviewed-by: Bart Van Assche Signed-off-by: Alan Stern Signed-off-by: Xiang Chen Signed-off-by: Martin K. Petersen --- include/linux/blk-pm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-pm.h b/include/linux/blk-pm.h index b80c65aba249..2580e05a8ab6 100644 --- a/include/linux/blk-pm.h +++ b/include/linux/blk-pm.h @@ -14,7 +14,7 @@ extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev); extern int blk_pre_runtime_suspend(struct request_queue *q); extern void blk_post_runtime_suspend(struct request_queue *q, int err); extern void blk_pre_runtime_resume(struct request_queue *q); -extern void blk_post_runtime_resume(struct request_queue *q, int err); +extern void blk_post_runtime_resume(struct request_queue *q); extern void blk_set_runtime_active(struct request_queue *q); #else static inline void blk_pm_runtime_init(struct request_queue *q, -- cgit v1.2.3 From 30be4551f9e26292599e666985119a5b559a2e4a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 22 Dec 2021 18:32:56 +0200 Subject: wwan: Replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when there are circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. Signed-off-by: Andy Shevchenko Reviewed-by: Sergey Ryazanov Signed-off-by: David S. 
Miller --- include/linux/wwan.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index e143c88bf4b0..afb3334ec8c5 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -4,12 +4,9 @@ #ifndef __WWAN_H #define __WWAN_H -#include -#include #include -#include -#include #include +#include /** * enum wwan_port_type - WWAN port types @@ -37,6 +34,10 @@ enum wwan_port_type { WWAN_PORT_UNKNOWN, }; +struct device; +struct file; +struct netlink_ext_ack; +struct sk_buff; struct wwan_port; /** struct wwan_port_ops - The WWAN port operations -- cgit v1.2.3 From 7da1d1ddd1f02e5de7497a0c849256912652fb6c Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:35 -0500 Subject: cuda/pmu: Make find_via_cuda/pmu init functions Make `find_via_cuda` and `find_via_pmu` initialization functions. Previously, their definitions in `drivers/macintosh/via-cuda.h` include the `__init` attribute but their alternative definitions in `arch/powerpc/powermac/setup.c` and prototypes in `include/linux/cuda.h` and `include/linux/pmu.h` do not use the `__init` macro. Since only initialization functions call `find_via_cuda` and `find_via_pmu`, it is safe to label these functions with `__init`.
Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-21-nick.child@ibm.com --- include/linux/cuda.h | 2 +- include/linux/pmu.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cuda.h b/include/linux/cuda.h index 45bfe9d61271..daf3e6f98444 100644 --- a/include/linux/cuda.h +++ b/include/linux/cuda.h @@ -12,7 +12,7 @@ #include -extern int find_via_cuda(void); +extern int __init find_via_cuda(void); extern int cuda_request(struct adb_request *req, void (*done)(struct adb_request *), int nbytes, ...); extern void cuda_poll(void); diff --git a/include/linux/pmu.h b/include/linux/pmu.h index 52453a24a24f..c677442d007c 100644 --- a/include/linux/pmu.h +++ b/include/linux/pmu.h @@ -13,7 +13,7 @@ #include -extern int find_via_pmu(void); +extern int __init find_via_pmu(void); extern int pmu_request(struct adb_request *req, void (*done)(struct adb_request *), int nbytes, ...); -- cgit v1.2.3 From 66b354064a35b6379963cba27b5d37a278fc9bd9 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 23 Nov 2021 11:16:00 +0100 Subject: powercap/drivers/dtpm: Remove unused function definition The dtpm.h header file is exporting a function which is neither implemented nor needed. Remove it.
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20211123101601.2433340-1-daniel.lezcano@linaro.org --- include/linux/dtpm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h index 2890f6370eb9..d37e5d06a357 100644 --- a/include/linux/dtpm.h +++ b/include/linux/dtpm.h @@ -70,6 +70,4 @@ void dtpm_unregister(struct dtpm *dtpm); int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent); -int dtpm_register_cpu(struct dtpm *parent); - #endif -- cgit v1.2.3 From dd123e62bdedcd3a486e48e883ec63138ec2c14c Mon Sep 17 00:00:00 2001 From: Henning Schild Date: Mon, 13 Dec 2021 13:04:59 +0100 Subject: platform/x86: simatic-ipc: add main driver for Siemens devices This mainly implements detection of these devices and will allow secondary drivers to work on such machines. The identification is DMI-based with a vendor specific way to tell them apart in a reliable way. Drivers for LEDs and Watchdogs will follow to make use of that platform detection. There is also some code to allow secondary drivers to find GPIO memory, that needs to be in place because the pinctrl drivers do not come up. 
Signed-off-by: Henning Schild Link: https://lore.kernel.org/r/20211213120502.20661-2-henning.schild@siemens.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/simatic-ipc-base.h | 29 +++++++++ include/linux/platform_data/x86/simatic-ipc.h | 72 ++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 include/linux/platform_data/x86/simatic-ipc-base.h create mode 100644 include/linux/platform_data/x86/simatic-ipc.h (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/simatic-ipc-base.h b/include/linux/platform_data/x86/simatic-ipc-base.h new file mode 100644 index 000000000000..62d2bc774067 --- /dev/null +++ b/include/linux/platform_data/x86/simatic-ipc-base.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Siemens SIMATIC IPC drivers + * + * Copyright (c) Siemens AG, 2018-2021 + * + * Authors: + * Henning Schild + * Gerd Haeussler + */ + +#ifndef __PLATFORM_DATA_X86_SIMATIC_IPC_BASE_H +#define __PLATFORM_DATA_X86_SIMATIC_IPC_BASE_H + +#include + +#define SIMATIC_IPC_DEVICE_NONE 0 +#define SIMATIC_IPC_DEVICE_227D 1 +#define SIMATIC_IPC_DEVICE_427E 2 +#define SIMATIC_IPC_DEVICE_127E 3 +#define SIMATIC_IPC_DEVICE_227E 4 + +struct simatic_ipc_platform { + u8 devmode; +}; + +u32 simatic_ipc_get_membase0(unsigned int p2sb); + +#endif /* __PLATFORM_DATA_X86_SIMATIC_IPC_BASE_H */ diff --git a/include/linux/platform_data/x86/simatic-ipc.h b/include/linux/platform_data/x86/simatic-ipc.h new file mode 100644 index 000000000000..f3b76b39776b --- /dev/null +++ b/include/linux/platform_data/x86/simatic-ipc.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Siemens SIMATIC IPC drivers + * + * Copyright (c) Siemens AG, 2018-2021 + * + * Authors: + * Henning Schild + * Gerd Haeussler + */ + +#ifndef __PLATFORM_DATA_X86_SIMATIC_IPC_H +#define __PLATFORM_DATA_X86_SIMATIC_IPC_H + +#include +#include + +#define SIMATIC_IPC_DMI_ENTRY_OEM 129 +/* binary type */ +#define 
SIMATIC_IPC_DMI_TYPE 0xff +#define SIMATIC_IPC_DMI_GROUP 0x05 +#define SIMATIC_IPC_DMI_ENTRY 0x02 +#define SIMATIC_IPC_DMI_TID 0x02 + +enum simatic_ipc_station_ids { + SIMATIC_IPC_INVALID_STATION_ID = 0, + SIMATIC_IPC_IPC227D = 0x00000501, + SIMATIC_IPC_IPC427D = 0x00000701, + SIMATIC_IPC_IPC227E = 0x00000901, + SIMATIC_IPC_IPC277E = 0x00000902, + SIMATIC_IPC_IPC427E = 0x00000A01, + SIMATIC_IPC_IPC477E = 0x00000A02, + SIMATIC_IPC_IPC127E = 0x00000D01, +}; + +static inline u32 simatic_ipc_get_station_id(u8 *data, int max_len) +{ + struct { + u8 type; /* type (0xff = binary) */ + u8 len; /* len of data entry */ + u8 group; + u8 entry; + u8 tid; + __le32 station_id; /* station id (LE) */ + } __packed * data_entry = (void *)data + sizeof(struct dmi_header); + + while ((u8 *)data_entry < data + max_len) { + if (data_entry->type == SIMATIC_IPC_DMI_TYPE && + data_entry->len == sizeof(*data_entry) && + data_entry->group == SIMATIC_IPC_DMI_GROUP && + data_entry->entry == SIMATIC_IPC_DMI_ENTRY && + data_entry->tid == SIMATIC_IPC_DMI_TID) { + return le32_to_cpu(data_entry->station_id); + } + data_entry = (void *)((u8 *)(data_entry) + data_entry->len); + } + + return SIMATIC_IPC_INVALID_STATION_ID; +} + +static inline void +simatic_ipc_find_dmi_entry_helper(const struct dmi_header *dh, void *_data) +{ + u32 *id = _data; + + if (dh->type != SIMATIC_IPC_DMI_ENTRY_OEM) + return; + + *id = simatic_ipc_get_station_id((u8 *)dh, dh->length); +} + +#endif /* __PLATFORM_DATA_X86_SIMATIC_IPC_H */ -- cgit v1.2.3 From b86947b52f0d0e5b6e6f0510933ca13aad266e47 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Fri, 24 Dec 2021 10:10:29 +0800 Subject: ASoC/soundwire: intel: simplify callbacks for params/hw_free We don't really need to pass a substream to the callback, we only need the direction. No functionality change, only simplification to enable improve suspend with paused streams. 
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Signed-off-by: Bard Liao Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20211224021034.26635-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_intel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 8a463b8fc12a..67e0d3e750b5 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -92,7 +92,7 @@ * firmware. */ struct sdw_intel_stream_params_data { - struct snd_pcm_substream *substream; + int stream; struct snd_soc_dai *dai; struct snd_pcm_hw_params *hw_params; int link_id; @@ -105,7 +105,7 @@ struct sdw_intel_stream_params_data { * firmware. */ struct sdw_intel_stream_free_data { - struct snd_pcm_substream *substream; + int stream; struct snd_soc_dai *dai; int link_id; }; -- cgit v1.2.3 From 54bf7fa3efd08eea03e4bac04e188ee3db6173a7 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 13 Dec 2021 17:11:45 +0100 Subject: ima: Fix undefined arch_ima_get_secureboot() and co Currently arch_ima_get_secureboot() and arch_get_ima_policy() are defined only when CONFIG_IMA is set, and this makes any code calling those functions without CONFIG_IMA fail. Move the declaration and the dummy definition of those functions outside ifdef-CONFIG_IMA block for fixing the undefined symbols. 
Signed-off-by: Takashi Iwai [zohar@linux.ibm.com: removed in-tree/out-of-tree comment in patch description] Reviewed-by: Petr Vorel Signed-off-by: Mimi Zohar --- include/linux/ima.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index b6ab66a546ae..426b1744215e 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -50,21 +50,6 @@ static inline void ima_appraise_parse_cmdline(void) {} extern void ima_add_kexec_buffer(struct kimage *image); #endif -#ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT -extern bool arch_ima_get_secureboot(void); -extern const char * const *arch_get_ima_policy(void); -#else -static inline bool arch_ima_get_secureboot(void) -{ - return false; -} - -static inline const char * const *arch_get_ima_policy(void) -{ - return NULL; -} -#endif - #else static inline enum hash_algo ima_get_current_hash_algo(void) { @@ -155,6 +140,21 @@ static inline int ima_measure_critical_data(const char *event_label, #endif /* CONFIG_IMA */ +#ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT +extern bool arch_ima_get_secureboot(void); +extern const char * const *arch_get_ima_policy(void); +#else +static inline bool arch_ima_get_secureboot(void) +{ + return false; +} + +static inline const char * const *arch_get_ima_policy(void) +{ + return NULL; +} +#endif + #ifndef CONFIG_IMA_KEXEC struct kimage; -- cgit v1.2.3 From 94ab10dd42a70acc5208a41325617e3d9cf81a70 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 24 Dec 2021 21:12:48 -0800 Subject: mm: delete unsafe BUG from page_cache_add_speculative() It is not easily reproducible, but on 5.16-rc I have several times hit the VM_BUG_ON_PAGE(PageTail(page), page) in page_cache_add_speculative(): usually from filemap_get_read_batch() for an ext4 read, yesterday from next_uptodate_page() from filemap_map_pages() for a shmem fault. 
That BUG used to be placed where page_ref_add_unless() had succeeded, but now it is placed before folio_ref_add_unless() is attempted: that is not safe, since it is only the acquired reference which makes the page safe from racing THP collapse or split. We could keep the BUG, checking PageTail only when folio_ref_try_add_rcu() has succeeded; but I don't think it adds much value - just delete it. Link: https://lkml.kernel.org/r/8b98fc6f-3439-8614-c3f3-945c659a1aba@google.com Fixes: 020853b6f5ea ("mm: Add folio_try_get_rcu()") Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Reviewed-by: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: William Kucharski Cc: Christoph Hellwig Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 605246452305..d150a9082b31 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -285,7 +285,6 @@ static inline struct inode *folio_inode(struct folio *folio) static inline bool page_cache_add_speculative(struct page *page, int count) { - VM_BUG_ON_PAGE(PageTail(page), page); return folio_ref_try_add_rcu((struct folio *)page, count); } -- cgit v1.2.3 From 595ec1973c276f6c0c1de8aca5eef8dfd81f9b49 Mon Sep 17 00:00:00 2001 From: Thibaut Sautereau Date: Fri, 24 Dec 2021 21:12:51 -0800 Subject: mm/page_alloc: fix __alloc_size attribute for alloc_pages_exact_nid The second parameter of alloc_pages_exact_nid is the one indicating the size of memory pointed by the returned pointer. 
Link: https://lkml.kernel.org/r/YbjEgwhn4bGblp//@coeus Fixes: abd58f38dfb4 ("mm/page_alloc: add __alloc_size attributes for better bounds checking") Signed-off-by: Thibaut Sautereau Acked-by: Kees Cook Cc: Daniel Micay Cc: Levente Polyak Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b976c4177299..8fcc38467af6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -624,7 +624,7 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); void free_pages_exact(void *virt, size_t size); -__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(1); +__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) -- cgit v1.2.3 From 4fb0abfee424b05f0ec6d2d09e38f04ee2b82a8a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 8 Nov 2021 15:51:21 -0600 Subject: x86/amd_nb: Add AMD Family 19h Models (10h-1Fh) and (A0h-AFh) PCI IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the new PCI Device IDs to support new generation of AMD 19h family of processors. 
Signed-off-by: Yazen Ghannam Signed-off-by: Babu Moger Acked-by: Krzysztof Wilczyński Acked-by: Borislav Petkov Acked-by: Bjorn Helgaas # pci_ids.h Link: https://lore.kernel.org/r/163640828133.955062.18349019796157170473.stgit@bmoger-ubuntu Signed-off-by: Guenter Roeck --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 011f2f1ea5bb..b5248f27910e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -555,6 +555,7 @@ #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443 #define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653 +#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F3 0x14b0 #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F3 0x167c #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 -- cgit v1.2.3 From 11a24ca7e34d968991a7d437b950d1924396bd81 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 25 Nov 2021 03:08:38 +0100 Subject: hwmon: (ntc_thermistor) Merge platform data into driver Platform data is supposed to be used with "board files", device descriptions in C. Since the introduction of the NTC driver in 2011, no such platforms have been submitted to the Linux kernel, and their use is strongly discouraged in favor of Device Tree, ACPI or as last resort software firmware nodes. Drop the external header and copy the platform data into the driver file. 
Cc: Peter Rosin Cc: Chris Lesiak Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20211125020841.3616359-2-linus.walleij@linaro.org Signed-off-by: Guenter Roeck --- include/linux/platform_data/ntc_thermistor.h | 50 ---------------------------- 1 file changed, 50 deletions(-) delete mode 100644 include/linux/platform_data/ntc_thermistor.h (limited to 'include/linux') diff --git a/include/linux/platform_data/ntc_thermistor.h b/include/linux/platform_data/ntc_thermistor.h deleted file mode 100644 index b324d03e580c..000000000000 --- a/include/linux/platform_data/ntc_thermistor.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ntc_thermistor.h - NTC Thermistors - * - * Copyright (C) 2010 Samsung Electronics - * MyungJoo Ham - */ -#ifndef _LINUX_NTC_H -#define _LINUX_NTC_H - -struct iio_channel; - -enum ntc_thermistor_type { - TYPE_B57330V2103, - TYPE_B57891S0103, - TYPE_NCPXXWB473, - TYPE_NCPXXWF104, - TYPE_NCPXXWL333, - TYPE_NCPXXXH103, -}; - -struct ntc_thermistor_platform_data { - /* - * One (not both) of read_uV and read_ohm should be provided and only - * one of the two should be provided. - * Both functions should return negative value for an error case. - * - * pullup_uV, pullup_ohm, pulldown_ohm, and connect are required to use - * read_uV() - * - * How to setup pullup_ohm, pulldown_ohm, and connect is - * described at Documentation/hwmon/ntc_thermistor.rst - * - * pullup/down_ohm: 0 for infinite / not-connected - * - * chan: iio_channel pointer to communicate with the ADC which the - * thermistor is using for conversion of the analog values. 
- */ - int (*read_uv)(struct ntc_thermistor_platform_data *); - unsigned int pullup_uv; - - unsigned int pullup_ohm; - unsigned int pulldown_ohm; - enum { NTC_CONNECTED_POSITIVE, NTC_CONNECTED_GROUND } connect; - struct iio_channel *chan; - - int (*read_ohm)(void); -}; - -#endif /* _LINUX_NTC_H */ -- cgit v1.2.3 From 130d168866a11829b844ffdb19b9aefe384f754c Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 16 Dec 2021 16:42:57 +0100 Subject: hwmon: prefix kernel-doc comments for structs with struct The command ./scripts/kernel-doc -none include/linux/hwmon.h warns: include/linux/hwmon.h:406: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Channel information include/linux/hwmon.h:425: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Chip configuration Address those kernel-doc warnings by prefixing kernel-doc descriptions for structs with the keyword 'struct'. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20211216154257.26758-1-lukas.bulwahn@gmail.com Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 1e8d6ea8992e..fad1f1df26df 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -403,7 +403,7 @@ struct hwmon_ops { }; /** - * Channel information + * struct hwmon_channel_info - Channel information * @type: Channel type. * @config: Pointer to NULL-terminated list of channel parameters. * Use for per-channel attributes. @@ -422,7 +422,7 @@ struct hwmon_channel_info { }) /** - * Chip configuration + * struct hwmon_chip_info - Chip configuration * @ops: Pointer to hwmon operations. * @info: Null-terminated list of channel information. 
*/ -- cgit v1.2.3 From ee6d3dd4ed48ab24b74bab3c3977b8218518247d Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Fri, 24 Dec 2021 23:13:45 +0000 Subject: driver core: make kobj_type constant. This way instances of kobj_type (which contain function pointers) can be stored in .rodata, which means that they cannot be [easily/accidentally] modified at runtime. Signed-off-by: Wedson Almeida Filho Link: https://lore.kernel.org/r/20211224231345.777370-1-wedsonaf@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index c740062b4b1a..683172b2e094 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -66,7 +66,7 @@ struct kobject { struct list_head entry; struct kobject *parent; struct kset *kset; - struct kobj_type *ktype; + const struct kobj_type *ktype; struct kernfs_node *sd; /* sysfs directory entry */ struct kref kref; #ifdef CONFIG_DEBUG_KOBJECT_RELEASE @@ -90,13 +90,13 @@ static inline const char *kobject_name(const struct kobject *kobj) return kobj->name; } -extern void kobject_init(struct kobject *kobj, struct kobj_type *ktype); +extern void kobject_init(struct kobject *kobj, const struct kobj_type *ktype); extern __printf(3, 4) __must_check int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...); extern __printf(4, 5) __must_check int kobject_init_and_add(struct kobject *kobj, - struct kobj_type *ktype, struct kobject *parent, + const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...); extern void kobject_del(struct kobject *kobj); @@ -217,7 +217,7 @@ static inline void kset_put(struct kset *k) kobject_put(&k->kobj); } -static inline struct kobj_type *get_ktype(struct kobject *kobj) +static inline const struct kobj_type *get_ktype(struct kobject *kobj) { return kobj->ktype; } -- cgit v1.2.3 From 
1882de7fc56c2b0ea91dd9fd9922d434fc3feb15 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 22 Dec 2021 12:31:03 +0800 Subject: efi: Introduce EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER and corresponding structures Platform Firmware Runtime Update image starts with UEFI headers, and the headers are defined in UEFI specification, but some of them have not been defined in the kernel yet. For example, the header layout of a capsule file looks like this: EFI_CAPSULE_HEADER EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER EFI_FIRMWARE_MANAGEMENT_CAPSULE_IMAGE_HEADER EFI_FIRMWARE_IMAGE_AUTHENTICATION These structures would be used by the Platform Firmware Runtime Update driver to parse the format of capsule file to verify if the corresponding version number is valid. In this way, if the user provides an invalid capsule image, the kernel could be used as a guard to reject it, without switching to the Management Mode (which might be costly). EFI_CAPSULE_HEADER has been defined in the kernel, but the other structures have not been defined yet, so do that. Besides, EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER and EFI_FIRMWARE_MANAGEMENT_CAPSULE_IMAGE_HEADER are required to be packed in the uefi specification. For this reason, use the __packed attribute to indicate to the compiler that the entire structure can appear misaligned in memory (as suggested by Ard) in case one of them follows the other directly in a capsule header. Acked-by: Ard Biesheuvel Signed-off-by: Chen Yu Signed-off-by: Rafael J. 
Wysocki --- include/linux/efi.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index dbd39b20e034..80e970f7e6f8 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -148,6 +148,52 @@ typedef struct { u32 imagesize; } efi_capsule_header_t; +/* EFI_FIRMWARE_MANAGEMENT_CAPSULE_HEADER */ +struct efi_manage_capsule_header { + u32 ver; + u16 emb_drv_cnt; + u16 payload_cnt; + /* + * Variable-size array of the size given by the sum of + * emb_drv_cnt and payload_cnt. + */ + u64 offset_list[]; +} __packed; + +/* EFI_FIRMWARE_MANAGEMENT_CAPSULE_IMAGE_HEADER */ +struct efi_manage_capsule_image_header { + u32 ver; + efi_guid_t image_type_id; + u8 image_index; + u8 reserved_bytes[3]; + u32 image_size; + u32 vendor_code_size; + /* hw_ins was introduced in version 2 */ + u64 hw_ins; + /* capsule_support was introduced in version 3 */ + u64 capsule_support; +} __packed; + +/* WIN_CERTIFICATE */ +struct win_cert { + u32 len; + u16 rev; + u16 cert_type; +}; + +/* WIN_CERTIFICATE_UEFI_GUID */ +struct win_cert_uefi_guid { + struct win_cert hdr; + efi_guid_t cert_type; + u8 cert_data[]; +}; + +/* EFI_FIRMWARE_IMAGE_AUTHENTICATION */ +struct efi_image_auth { + u64 mon_count; + struct win_cert_uefi_guid auth_info; +}; + /* * EFI capsule flags */ -- cgit v1.2.3 From cf6299b6101903c31bddb0065804b2121ed510c7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 27 Dec 2021 17:39:24 +0100 Subject: kobject: remove kset from struct kset_uevent_ops callbacks There is no need to pass the pointer to the kset in the struct kset_uevent_ops callbacks as no one uses it, so just remove that pointer entirely. Reviewed-by: Rafael J. 
Wysocki Reviewed-by: Wedson Almeida Filho Link: https://lore.kernel.org/r/20211227163924.3970661-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 683172b2e094..ad90b49824dc 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -153,10 +153,9 @@ struct kobj_uevent_env { }; struct kset_uevent_ops { - int (* const filter)(struct kset *kset, struct kobject *kobj); - const char *(* const name)(struct kset *kset, struct kobject *kobj); - int (* const uevent)(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env); + int (* const filter)(struct kobject *kobj); + const char *(* const name)(struct kobject *kobj); + int (* const uevent)(struct kobject *kobj, struct kobj_uevent_env *env); }; struct kobj_attribute { -- cgit v1.2.3 From d6b9c679bbac1d1d2fcac64391b4cadb91763a6f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 12 Nov 2021 14:46:32 -0800 Subject: watchdog: bcm7038_wdt: Support platform data configuration The BCM7038 watchdog driver needs to be able to obtain a specific clock name on BCM63xx platforms which is the "periph" clock ticking at 50MHz. make it possible to specify the clock name to obtain via platform data. 
Signed-off-by: Florian Fainelli Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20211112224636.395101-4-f.fainelli@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/platform_data/bcm7038_wdt.h | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 include/linux/platform_data/bcm7038_wdt.h (limited to 'include/linux') diff --git a/include/linux/platform_data/bcm7038_wdt.h b/include/linux/platform_data/bcm7038_wdt.h new file mode 100644 index 000000000000..e18cfd9ec8f9 --- /dev/null +++ b/include/linux/platform_data/bcm7038_wdt.h @@ -0,0 +1,8 @@ +#ifndef __BCM7038_WDT_PDATA_H +#define __BCM7038_WDT_PDATA_H + +struct bcm7038_wdt_platform_data { + const char *clk_name; +}; + +#endif /* __BCM7038_WDT_PDATA_H */ -- cgit v1.2.3 From b92e301633f0f454aa1cfedac2e096bb9649b367 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Sat, 18 Dec 2021 16:25:53 +0100 Subject: mfd: ntxec: Change return type of ntxec_reg8 from __be16 to u16 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Register values in NTXEC are big-endian on the I2C bus, but the regmap subsystem handles the conversion between CPU-endian and big-endian data internally. ntxec_reg8 should thus return u16, not __be16. Reported-by: kernel test robot Signed-off-by: Jonathan Neuschäfer Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211218152553.744615-1-j.neuschaefer@gmx.net --- include/linux/mfd/ntxec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ntxec.h b/include/linux/mfd/ntxec.h index 26ab3b8eb612..cc6f07bfa2b3 100644 --- a/include/linux/mfd/ntxec.h +++ b/include/linux/mfd/ntxec.h @@ -26,7 +26,7 @@ struct ntxec { * This convenience function converts an 8-bit value to 16-bit for use in the * second kind of register. 
*/ -static inline __be16 ntxec_reg8(u8 value) +static inline u16 ntxec_reg8(u8 value) { return value << 8; } -- cgit v1.2.3 From b6459415b384cb829f0b2a4268f211c789f6cf0b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 28 Dec 2021 16:49:13 -0800 Subject: net: Don't include filter.h from net/sock.h sock.h is pretty heavily used (5k objects rebuilt on x86 after it's touched). We can drop the include of filter.h from it and add a forward declaration of struct sk_filter instead. This decreases the number of rebuilt objects when bpf.h is touched from ~5k to ~1k. There's a lot of missing includes this was masking. Primarily in networking tho, this time. Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Acked-by: Marc Kleine-Budde Acked-by: Florian Fainelli Acked-by: Nikolay Aleksandrov Acked-by: Stefano Garzarella Link: https://lore.kernel.org/bpf/20211229004913.513372-1-kuba@kernel.org --- include/linux/bpf_local_storage.h | 1 + include/linux/dsa/loop.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 24496bc28e7b..a2b625960ffe 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -8,6 +8,7 @@ #define _BPF_LOCAL_STORAGE_H #include +#include #include #include #include diff --git a/include/linux/dsa/loop.h b/include/linux/dsa/loop.h index 5a3470bcc8a7..b8fef35591aa 100644 --- a/include/linux/dsa/loop.h +++ b/include/linux/dsa/loop.h @@ -2,6 +2,7 @@ #ifndef DSA_LOOP_H #define DSA_LOOP_H +#include #include #include #include -- cgit v1.2.3 From 0fe4b381a59ebc53522fce579b281a67a9e1bee6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 24 Dec 2021 15:29:15 +0000 Subject: bpf: Allow bpf_local_storage to be used by sleepable programs Other maps like hashmaps are already available to sleepable programs. Sleepable BPF programs run under trace RCU. Allow task, sk and inode storage to be used from sleepable programs. 
This allows sleepable and non-sleepable programs to provide shareable annotations on kernel objects. Sleepable programs run in trace RCU whereas non-sleepable programs run in a normal RCU critical section, i.e. __bpf_prog_enter{_sleepable} and __bpf_prog_exit{_sleepable} (rcu_read_lock or rcu_read_lock_trace). In order to make the local storage maps accessible to both sleepable and non-sleepable programs, one needs to call both call_rcu_tasks_trace and call_rcu to wait for both trace and classical RCU grace periods to expire before freeing memory. Paul's work on call_rcu_tasks_trace allows us to have per-CPU queueing for call_rcu_tasks_trace. This behaviour can be achieved by setting the rcupdate.rcu_task_enqueue_lim= boot parameter. In light of these new performance changes and to keep the local storage code simple, avoid adding a new flag for sleepable maps / local storage to select the RCU synchronization (trace / classical). Also, update the dereferencing of the pointers to use rcu_dereference_check (with either the trace or normal RCU locks held) with a common bpf_rcu_lock_held helper method.
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211224152916.1550677-2-kpsingh@kernel.org --- include/linux/bpf_local_storage.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a2b625960ffe..37b3906af8b1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -17,6 +17,9 @@ #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 +#define bpf_rcu_lock_held() \ + (rcu_read_lock_held() || rcu_read_lock_trace_held() || \ + rcu_read_lock_bh_held()) struct bpf_local_storage_map_bucket { struct hlist_head list; raw_spinlock_t lock; @@ -162,4 +165,6 @@ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags); +void bpf_local_storage_free_rcu(struct rcu_head *rcu); + #endif /* _BPF_LOCAL_STORAGE_H */ -- cgit v1.2.3 From aebb51ec3db2a871d74b4afad3f9914812acf120 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Dec 2021 17:27:42 -0800 Subject: bpf: Invert the dependency between bpf-netns.h and netns/bpf.h netns/bpf.h gets included by netdevice.h (thru net_namespace.h) which in turn gets included in a lot of places. We should keep netns/bpf.h as light-weight as possible. bpf-netns.h seems to contain more implementation details than deserves to be included in a netns header. It needs to pull in uapi/bpf.h to get various enum types. Move enum netns_bpf_attach_type to netns/bpf.h and invert the dependency. This makes netns/bpf.h fit the mold of a struct definition header more clearly, and drops the number of objects rebuilt when uapi/bpf.h is touched from 7.7k to 1.1k. 
Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230012742.770642-3-kuba@kernel.org --- include/linux/bpf-netns.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index 722f799c1a2e..413cfa5e4b07 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -3,15 +3,9 @@ #define _BPF_NETNS_H #include +#include #include -enum netns_bpf_attach_type { - NETNS_BPF_INVALID = -1, - NETNS_BPF_FLOW_DISSECTOR = 0, - NETNS_BPF_SK_LOOKUP, - MAX_NETNS_BPF_ATTACH_TYPE -}; - static inline enum netns_bpf_attach_type to_netns_bpf_attach_type(enum bpf_attach_type attach_type) { -- cgit v1.2.3 From 730b49aac426e1e8016d3c2dd6b407e500423821 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Thu, 23 Dec 2021 11:24:22 +0300 Subject: usb: typec: port-mapper: Convert to the component framework Instead of trying to keep track of the connections to the USB Type-C connectors separately, let the component framework take care of that. From now on every USB Type-C connector will register itself as "aggregate" - component master - and anything that can be connected to it inside the system can then simply register itself as a generic component. The matching of the components and the connector shall rely on ACPI _PLD initially. Before registering itself as the aggregate, the connector will find all other ACPI devices that have matching _PLD crc hash with it (matching value in the pld_crc member of struct acpi_device), and add a component match entry for each one of them. Because only ACPI is supported for now, the driver shall only be built when ACPI is supported. This removes the need for the custom API that the driver exposed. The components and the connector can therefore exist completely independently of each other. The order in which they are registered, as well as whether they are modules or not, is now irrelevant. 
Acked-by: Rafael J. Wysocki Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20211223082422.45637-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index e2e44bb1dad8..7ba45a97eeae 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -305,16 +305,4 @@ void typec_partner_set_svdm_version(struct typec_partner *partner, enum usb_pd_svdm_ver svdm_version); int typec_get_negotiated_svdm_version(struct typec_port *port); -#if IS_REACHABLE(CONFIG_TYPEC) -int typec_link_port(struct device *port); -void typec_unlink_port(struct device *port); -#else -static inline int typec_link_port(struct device *port) -{ - return 0; -} - -static inline void typec_unlink_port(struct device *port) { } -#endif - #endif /* __LINUX_USB_TYPEC_H */ -- cgit v1.2.3 From 510a0bdb2bfcff8d7be822c72adc3add7a97d559 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Thu, 23 Dec 2021 11:24:32 +0300 Subject: usb: Remove usb_for_each_port() There are no more users for the function. Reviewed-by: Andy Shevchenko Acked-by: Rafael J. 
Wysocki Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20211223082432.45653-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 7ccaa76a9a96..200b7b79acb5 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -875,15 +875,6 @@ extern struct usb_host_interface *usb_find_alt_setting( unsigned int iface_num, unsigned int alt_num); -#if IS_REACHABLE(CONFIG_USB) -int usb_for_each_port(void *data, int (*fn)(struct device *, void *)); -#else -static inline int usb_for_each_port(void *data, int (*fn)(struct device *, void *)) -{ - return 0; -} -#endif - /* port claiming functions */ int usb_hub_claim_port(struct usb_device *hdev, unsigned port1, struct usb_dev_state *owner); -- cgit v1.2.3 From b4a29b94804c4774f22555651296b838df6ec0e4 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 28 Dec 2021 18:22:00 +0100 Subject: serial: 8250: Move Alpha-specific quirk out of the core struct uart_8250_port contains mcr_mask and mcr_force members whose sole purpose is to work around an Alpha-specific quirk. This code doesn't belong in the core where it is executed by everyone else, so move it to a proper ->set_mctrl callback which is used on the affected Alpha machine only. The quirk was introduced in January 1995: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/diff/drivers/char/serial.c?h=1.1.83 The members in struct uart_8250_port were added in 2002: https://git.kernel.org/history/history/c/4524aad27854 The quirk applies to non-PCI Alphas and arch/alpha/Kconfig specifies "select FORCE_PCI if !ALPHA_JENSEN". 
So apparently the only affected machine is the EISA-based Jensen that Linus was working on back then: https://lore.kernel.org/all/CAHk-=wj1JWZ3sCrGz16nxEj7=0O+srMg6Ah3iPTDXSPKEws_SA@mail.gmail.com/ Up until now the quirk is not applied unless CONFIG_PCI is disabled. If users forget to do that or run a generic Alpha kernel, the serial ports aren't usable on Jensen. Avoid by confining the quirk to CONFIG_ALPHA_JENSEN instead of !CONFIG_PCI. On generic Alpha kernels, auto-detect at runtime whether the quirk needs to be applied. Cc: Russell King Cc: Ulrich Teichert Cc: Linus Torvalds Signed-off-by: Lukas Wunner Link: https://lore.kernel.org/r/b83d069cb516549b8a5420e097bb6bdd806f36fc.1640695609.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 5db211f43b29..ff84a3ed10ea 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -104,8 +104,6 @@ struct uart_8250_port { unsigned char ier; unsigned char lcr; unsigned char mcr; - unsigned char mcr_mask; /* mask of user bits */ - unsigned char mcr_force; /* mask of forced bits */ unsigned char cur_iotype; /* Running I/O type */ unsigned int rpm_tx_active; unsigned char canary; /* non-zero during system sleep -- cgit v1.2.3 From d8e9a406a931f687945703a4bac45042eb81ce92 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Sun, 12 Dec 2021 22:21:28 +0900 Subject: serdev: BREAK/FRAME/PARITY/OVERRUN notification prototype V2 Allow serdev device drivers get notified by hardware errors such as BREAK, FRAME, PARITY and OVERRUN. With this patch, in the event of an error detected in the UART device driver the serdev_device_driver will get the newly introduced ->error() callback invoked if serdev_device_set_error_mask() has previously been used to enable the type of error. 
The errors are taken straight from the TTY layer and fed into the serdev_device_driver after filtering out only enabled errors. Without this patch the hardware errors never reach the serdev_device_driver. Signed-off-by: Magnus Damm Link: https://lore.kernel.org/r/163931528842.27756.3665040315954968747.sendpatchset@octo Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 3368c261ab62..0d0b22fc7e37 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -19,12 +19,15 @@ struct serdev_device; /** * struct serdev_device_ops - Callback operations for a serdev device + * @error: Function called with errors received from device; + * may sleep. * @receive_buf: Function called with data received from device; * returns number of bytes accepted; may sleep. * @write_wakeup: Function called when ready to transmit more data; must * not sleep. 
*/ struct serdev_device_ops { + void (*error)(struct serdev_device *, unsigned long); int (*receive_buf)(struct serdev_device *, const unsigned char *, size_t); void (*write_wakeup)(struct serdev_device *); }; @@ -76,6 +79,11 @@ enum serdev_parity { SERDEV_PARITY_ODD, }; +#define SERDEV_ERROR_BREAK 0 +#define SERDEV_ERROR_FRAME 1 +#define SERDEV_ERROR_PARITY 2 +#define SERDEV_ERROR_OVERRUN 3 + /* * serdev controller structures */ @@ -85,6 +93,7 @@ struct serdev_controller_ops { int (*write_room)(struct serdev_controller *); int (*open)(struct serdev_controller *); void (*close)(struct serdev_controller *); + void (*set_error_mask)(struct serdev_controller *, unsigned long); void (*set_flow_control)(struct serdev_controller *, bool); int (*set_parity)(struct serdev_controller *, enum serdev_parity); unsigned int (*set_baudrate)(struct serdev_controller *, unsigned int); @@ -190,12 +199,24 @@ static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl, return serdev->ops->receive_buf(serdev, data, count); } +static inline void serdev_controller_error(struct serdev_controller *ctrl, + unsigned long errors) +{ + struct serdev_device *serdev = ctrl->serdev; + + if (!serdev || !serdev->ops->error) + return; + + serdev->ops->error(serdev, errors); +} + #if IS_ENABLED(CONFIG_SERIAL_DEV_BUS) int serdev_device_open(struct serdev_device *); void serdev_device_close(struct serdev_device *); int devm_serdev_device_open(struct device *, struct serdev_device *); unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int); +void serdev_device_set_error_mask(struct serdev_device *, unsigned long); void serdev_device_set_flow_control(struct serdev_device *, bool); int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t); void serdev_device_wait_until_sent(struct serdev_device *, long); @@ -238,6 +259,7 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev { return 0; } +static inline void 
serdev_device_set_error_mask(struct serdev_device *sdev, unsigned long mask) {} static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {} static inline int serdev_device_write_buf(struct serdev_device *serdev, const unsigned char *buf, -- cgit v1.2.3 From 5207fb2f311b0c45a9abfa1c84b7a7b657ffa550 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 30 Dec 2021 16:02:41 +0100 Subject: counter: Provide a wrapper to access device private data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For now this just wraps accessing struct counter_device::priv. However this is about to change and converting drivers to this helper individually makes fixing device lifetime issues result in easier to review patches. Reviewed-by: Jonathan Cameron Acked-by: William Breathitt Gray Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20211230150300.72196-5-u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index dfbde2808998..627f1757f6bb 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -329,6 +329,8 @@ struct counter_device { struct mutex ops_exist_lock; }; +void *counter_priv(const struct counter_device *const counter); + int counter_register(struct counter_device *const counter); void counter_unregister(struct counter_device *const counter); int devm_counter_register(struct device *dev, -- cgit v1.2.3 From c18e2760308e30f007fa24b558b87c39d7e86ff1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 30 Dec 2021 16:02:50 +0100 Subject: counter: Provide alternative counter registration functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current implementation gets device lifetime tracking wrong. 
The problem is that allocation of struct counter_device is controlled by the individual drivers but this structure contains a struct device that might have to live longer than a driver is bound. As a result a command sequence like: { sleep 5; echo bang; } > /dev/counter0 & sleep 1; echo 40000000.timer:counter > /sys/bus/platform/drivers/stm32-timer-counter/unbind can keep a reference to the struct device and unbinding results in freeing the memory occupied by this device resulting in an oops. This commit provides two new functions (plus some helpers): - counter_alloc() to allocate a struct counter_device that is automatically freed once the embedded struct device is released - counter_add() to register such a device. Note that this commit doesn't fix any issues, all drivers have to be converted to these new functions to correct the lifetime problems. Reviewed-by: Jonathan Cameron Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20211230150300.72196-14-u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index 627f1757f6bb..ed8d5820f0d1 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -327,14 +327,29 @@ struct counter_device { spinlock_t events_in_lock; struct mutex events_out_lock; struct mutex ops_exist_lock; + + /* + * This can go away once all drivers are converted to + * counter_alloc()/counter_add(). 
+ */ + bool legacy_device; }; void *counter_priv(const struct counter_device *const counter); int counter_register(struct counter_device *const counter); + +struct counter_device *counter_alloc(size_t sizeof_priv); +void counter_put(struct counter_device *const counter); +int counter_add(struct counter_device *const counter); + void counter_unregister(struct counter_device *const counter); int devm_counter_register(struct device *dev, struct counter_device *const counter); +struct counter_device *devm_counter_alloc(struct device *dev, + size_t sizeof_priv); +int devm_counter_add(struct device *dev, + struct counter_device *const counter); void counter_push_event(struct counter_device *const counter, const u8 event, const u8 channel); -- cgit v1.2.3 From f2ee4759fb700b32a1bd830960fe86bf6bdfd0ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 30 Dec 2021 16:03:00 +0100 Subject: counter: remove old and now unused registration API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Usage of counter_register() yields issues in device lifetime tracking. All drivers were converted to the new API, so the old one can go away. 
Reviewed-by: Jonathan Cameron Acked-by: William Breathitt Gray Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20211230150300.72196-24-u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index ed8d5820f0d1..1fe17f5adb09 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -314,8 +314,6 @@ struct counter_device { struct counter_comp *ext; size_t num_ext; - void *priv; - struct device dev; struct cdev chrdev; struct list_head events_list; @@ -327,25 +325,15 @@ struct counter_device { spinlock_t events_in_lock; struct mutex events_out_lock; struct mutex ops_exist_lock; - - /* - * This can go away once all drivers are converted to - * counter_alloc()/counter_add(). - */ - bool legacy_device; }; void *counter_priv(const struct counter_device *const counter); -int counter_register(struct counter_device *const counter); - struct counter_device *counter_alloc(size_t sizeof_priv); void counter_put(struct counter_device *const counter); int counter_add(struct counter_device *const counter); void counter_unregister(struct counter_device *const counter); -int devm_counter_register(struct device *dev, - struct counter_device *const counter); struct counter_device *devm_counter_alloc(struct device *dev, size_t sizeof_priv); int devm_counter_add(struct device *dev, -- cgit v1.2.3 From d6c6d0bb2cb3af91c9c1546af7cdf4770d0df443 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 29 Dec 2021 12:36:20 +0100 Subject: net: remove references to CONFIG_IRDA in network header files Commit d64c2a76123f ("staging: irda: remove the irda network stack and drivers") removes the config IRDA. Remove the remaining references to this non-existing config in the network header files. 
Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20211229113620.19368-1-lukas.bulwahn@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/atalk.h | 2 +- include/linux/netdevice.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atalk.h b/include/linux/atalk.h index f6034ba774be..a55bfc6567d0 100644 --- a/include/linux/atalk.h +++ b/include/linux/atalk.h @@ -113,7 +113,7 @@ extern int aarp_proto_init(void); /* Inter module exports */ /* Give a device find its atif control structure */ -#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK) +#if IS_ENABLED(CONFIG_ATALK) static inline struct atalk_iface *atalk_find_dev(struct net_device *dev) { return dev->atalk_ptr; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2684bdb6defa..6f99c8f51b60 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2098,7 +2098,7 @@ struct net_device { #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; #endif -#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK) +#if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif struct in_device __rcu *ip_ptr; -- cgit v1.2.3 From c3fb0e280b4cdbf382f59eb6b276e4c6047bc219 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 18 Nov 2021 02:32:37 +0200 Subject: net/mlx5: DR, Fix lower case macro prefix "mlx5_" to "MLX5_" Macros prefix should be capital letters - fix the prefix in mlx5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED. 
Signed-off-by: Yevgeny Kliteynik --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e9db12aae8f9..18b816b41545 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1291,7 +1291,7 @@ enum { enum { MLX5_FLEX_PARSER_GENEVE_ENABLED = 1 << 3, MLX5_FLEX_PARSER_MPLS_OVER_GRE_ENABLED = 1 << 4, - mlx5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED = 1 << 5, + MLX5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED = 1 << 5, MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED = 1 << 7, MLX5_FLEX_PARSER_ICMP_V4_ENABLED = 1 << 8, MLX5_FLEX_PARSER_ICMP_V6_ENABLED = 1 << 9, -- cgit v1.2.3 From 0f2a6c3b9219bdf497750258cd2ad513f0056b42 Mon Sep 17 00:00:00 2001 From: Muhammad Sammar Date: Sun, 5 Sep 2021 15:16:21 +0300 Subject: net/mlx5: Add misc5 flow table match parameters Add support for misc5 match parameter as per HW spec, this will allow matching on tunnel_header fields. 
Signed-off-by: Muhammad Sammar Signed-off-by: Yevgeny Kliteynik --- include/linux/mlx5/device.h | 1 + include/linux/mlx5/mlx5_ifc.h | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 9c25edfd59a6..604b85dd770a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1117,6 +1117,7 @@ enum { MLX5_MATCH_MISC_PARAMETERS_2 = 1 << 3, MLX5_MATCH_MISC_PARAMETERS_3 = 1 << 4, MLX5_MATCH_MISC_PARAMETERS_4 = 1 << 5, + MLX5_MATCH_MISC_PARAMETERS_5 = 1 << 6, }; enum { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 18b816b41545..c74c5e147cb9 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -670,6 +670,26 @@ struct mlx5_ifc_fte_match_set_misc4_bits { u8 reserved_at_100[0x100]; }; +struct mlx5_ifc_fte_match_set_misc5_bits { + u8 macsec_tag_0[0x20]; + + u8 macsec_tag_1[0x20]; + + u8 macsec_tag_2[0x20]; + + u8 macsec_tag_3[0x20]; + + u8 tunnel_header_0[0x20]; + + u8 tunnel_header_1[0x20]; + + u8 tunnel_header_2[0x20]; + + u8 tunnel_header_3[0x20]; + + u8 reserved_at_100[0x100]; +}; + struct mlx5_ifc_cmd_pas_bits { u8 pa_h[0x20]; @@ -1839,7 +1859,9 @@ struct mlx5_ifc_fte_match_param_bits { struct mlx5_ifc_fte_match_set_misc4_bits misc_parameters_4; - u8 reserved_at_c00[0x400]; + struct mlx5_ifc_fte_match_set_misc5_bits misc_parameters_5; + + u8 reserved_at_e00[0x200]; }; enum { @@ -5977,6 +5999,7 @@ enum { MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2 = 0x3, MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_3 = 0x4, MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_4 = 0x5, + MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_5 = 0x6, }; struct mlx5_ifc_query_flow_group_out_bits { -- cgit v1.2.3 From f59464e257bdbd4df6df9a4505d7858a0baf6cf7 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 8 Nov 
2021 02:42:50 +0200 Subject: net/mlx5: DR, Add support for matching on geneve_tlv_option_0_exist field Match on geneve_tlv_option_0_exist field on devices that support STEv1. Signed-off-by: Muhammad Sammar Signed-off-by: Yevgeny Kliteynik --- include/linux/mlx5/mlx5_ifc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c74c5e147cb9..deaa0f71213f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -372,7 +372,8 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 reserved_at_37[0x9]; u8 geneve_tlv_option_0_data[0x1]; - u8 reserved_at_41[0x4]; + u8 geneve_tlv_option_0_exist[0x1]; + u8 reserved_at_42[0x3]; u8 outer_first_mpls_over_udp[0x4]; u8 outer_first_mpls_over_gre[0x4]; u8 inner_first_mpls[0x4]; @@ -551,7 +552,8 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 bth_opcode[0x8]; u8 geneve_vni[0x18]; - u8 reserved_at_d8[0x7]; + u8 reserved_at_d8[0x6]; + u8 geneve_tlv_option_0_exist[0x1]; u8 geneve_oam[0x1]; u8 reserved_at_e0[0xc]; -- cgit v1.2.3 From 4ff725e1d4adfd313bc9767523fc8d6e90d50f9c Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 23 Nov 2021 02:11:12 +0200 Subject: net/mlx5: DR, Ignore modify TTL if device doesn't support it When modifying TTL, packet's csum has to be recalculated. Due to HW issue in ConnectX-5, csum recalculation for modify TTL is supported through a work-around that is specifically enabled by configuration. If the work-around isn't enabled, ignore the modify TTL action rather than adding an unsupported action. 
Signed-off-by: Yevgeny Kliteynik --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index deaa0f71213f..598ac3bcc901 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -833,7 +833,7 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits { u8 fdb_to_vport_reg_c_id[0x8]; u8 reserved_at_8[0xd]; u8 fdb_modify_header_fwd_to_table[0x1]; - u8 reserved_at_16[0x1]; + u8 fdb_ipv4_ttl_modify[0x1]; u8 flow_source[0x1]; u8 reserved_at_18[0x2]; u8 multi_fdb_encap[0x1]; -- cgit v1.2.3 From 99a507a8ea28542ec196e2dd80096708e2482735 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 31 Dec 2021 13:42:30 +0100 Subject: Revert "serdev: BREAK/FRAME/PARITY/OVERRUN notification prototype V2" This reverts commit d8e9a406a931f687945703a4bac45042eb81ce92. It needs some future changes as pointed out by Johan and is not ready to be merged just yet. Reported-by: Johan Hovold Cc: Magnus Damm Link: https://lore.kernel.org/r/Yc7oZ/1tu95Z4wPS@hovoldconsulting.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 0d0b22fc7e37..3368c261ab62 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -19,15 +19,12 @@ struct serdev_device; /** * struct serdev_device_ops - Callback operations for a serdev device - * @error: Function called with errors received from device; - * may sleep. * @receive_buf: Function called with data received from device; * returns number of bytes accepted; may sleep. * @write_wakeup: Function called when ready to transmit more data; must * not sleep. 
*/ struct serdev_device_ops { - void (*error)(struct serdev_device *, unsigned long); int (*receive_buf)(struct serdev_device *, const unsigned char *, size_t); void (*write_wakeup)(struct serdev_device *); }; @@ -79,11 +76,6 @@ enum serdev_parity { SERDEV_PARITY_ODD, }; -#define SERDEV_ERROR_BREAK 0 -#define SERDEV_ERROR_FRAME 1 -#define SERDEV_ERROR_PARITY 2 -#define SERDEV_ERROR_OVERRUN 3 - /* * serdev controller structures */ @@ -93,7 +85,6 @@ struct serdev_controller_ops { int (*write_room)(struct serdev_controller *); int (*open)(struct serdev_controller *); void (*close)(struct serdev_controller *); - void (*set_error_mask)(struct serdev_controller *, unsigned long); void (*set_flow_control)(struct serdev_controller *, bool); int (*set_parity)(struct serdev_controller *, enum serdev_parity); unsigned int (*set_baudrate)(struct serdev_controller *, unsigned int); @@ -199,24 +190,12 @@ static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl, return serdev->ops->receive_buf(serdev, data, count); } -static inline void serdev_controller_error(struct serdev_controller *ctrl, - unsigned long errors) -{ - struct serdev_device *serdev = ctrl->serdev; - - if (!serdev || !serdev->ops->error) - return; - - serdev->ops->error(serdev, errors); -} - #if IS_ENABLED(CONFIG_SERIAL_DEV_BUS) int serdev_device_open(struct serdev_device *); void serdev_device_close(struct serdev_device *); int devm_serdev_device_open(struct device *, struct serdev_device *); unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int); -void serdev_device_set_error_mask(struct serdev_device *, unsigned long); void serdev_device_set_flow_control(struct serdev_device *, bool); int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t); void serdev_device_wait_until_sent(struct serdev_device *, long); @@ -259,7 +238,6 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev { return 0; } -static inline void 
serdev_device_set_error_mask(struct serdev_device *sdev, unsigned long mask) {} static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {} static inline int serdev_device_write_buf(struct serdev_device *serdev, const unsigned char *buf, -- cgit v1.2.3 From 9a45ac2320d0a6ae01880a30d4b86025fce4061b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 22 Dec 2021 22:41:18 -0500 Subject: fbdev: fbmem: add a helper to determine if an aperture is used by a fw fb Add a function for drivers to check if a firmware-initialized fb corresponds to their aperture. This allows drivers to check if the device corresponds to what the firmware set up as the display device. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=215203 Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1840 Signed-off-by: Alex Deucher --- include/linux/fb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 6f3db99ab990..3da95842b207 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -610,6 +610,7 @@ extern int remove_conflicting_pci_framebuffers(struct pci_dev *pdev, const char *name); extern int remove_conflicting_framebuffers(struct apertures_struct *a, const char *name, bool primary); +extern bool is_firmware_framebuffer(struct apertures_struct *a); extern int fb_prepare_logo(struct fb_info *fb_info, int rotate); extern int fb_show_logo(struct fb_info *fb_info, int rotate); extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size); -- cgit v1.2.3 From 1b4e3f26f9f7553b260b8aed43967500961448a6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 2 Dec 2021 15:06:14 +0000 Subject: mm: vmscan: Reduce throttling due to a failure to make progress Mike Galbraith, Alexey Avramov and Darrick Wong all reported similar problems due to reclaim throttling for excessive lengths of time. 
In Alexey's case, a memory hog that should go OOM quickly stalls for several minutes before going OOM. In Mike and Darrick's cases, a small memcg environment stalled excessively even though the system had enough memory overall. Commit 69392a403f49 ("mm/vmscan: throttle reclaim when no progress is being made") introduced the problem although commit a19594ca4a8b ("mm/vmscan: increase the timeout if page reclaim is not making progress") made it worse. Systems at or near an OOM state that cannot be recovered must reach OOM quickly and memcg should kill tasks if a memcg is near OOM. To address this, only stall for the first zone in the zonelist, reduce the timeout to 1 tick for VMSCAN_THROTTLE_NOPROGRESS and only stall if the scan control nr_reclaimed is 0, kswapd is still active and there were excessive pages pending for writeback. If kswapd has stopped reclaiming due to excessive failures, do not stall at all so that OOM triggers relatively quickly. Similarly, if an LRU is simply congested, only lightly throttle similar to NOPROGRESS. Alexey's original case was the most straightforward: for i in {1..3}; do tail /dev/zero; done On vanilla 5.16-rc1, this test stalled heavily; after the patch the test completes in a few seconds similar to 5.15. Alexey's second test case added watching a youtube video while tail runs 10 times. On 5.15, playback only jitters slightly, 5.16-rc1 stalls a lot with lots of frames missing and numerous audio glitches. With this patch applied, the video plays similarly to 5.15. 
[lkp@intel.com: Fix W=1 build warning] Link: https://lore.kernel.org/r/99e779783d6c7fce96448a3402061b9dc1b3b602.camel@gmx.de Link: https://lore.kernel.org/r/20211124011954.7cab9bb4@mail.inbox.lv Link: https://lore.kernel.org/r/20211022144651.19914-1-mgorman@techsingularity.net Link: https://lore.kernel.org/r/20211202150614.22440-1-mgorman@techsingularity.net Link: https://linux-regtracking.leemhuis.info/regzbot/regression/20211124011954.7cab9bb4@mail.inbox.lv/ Reported-and-tested-by: Alexey Avramov Reported-and-tested-by: Mike Galbraith Reported-and-tested-by: Darrick J. Wong Reported-by: kernel test robot Acked-by: Hugh Dickins Tracked-by: Thorsten Leemhuis Fixes: 69392a403f49 ("mm/vmscan: throttle reclaim when no progress is being made") Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 58e744b78c2c..936dc0b6c226 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -277,6 +277,7 @@ enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, VMSCAN_THROTTLE_ISOLATED, VMSCAN_THROTTLE_NOPROGRESS, + VMSCAN_THROTTLE_CONGESTED, NR_VMSCAN_THROTTLE, }; -- cgit v1.2.3 From e7026f15564fbe0c8b091f218203111f77b84eda Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Tue, 28 Dec 2021 21:03:06 -0800 Subject: net: phy: lynx: refactor Lynx PCS module to use generic phylink_pcs Remove references to lynx_pcs structures so drivers like the Felix DSA can reference alternate PCS drivers. Signed-off-by: Colin Foster Signed-off-by: David S. 
Miller --- include/linux/pcs-lynx.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pcs-lynx.h b/include/linux/pcs-lynx.h index a6440d6ebe95..5712cc2ce775 100644 --- a/include/linux/pcs-lynx.h +++ b/include/linux/pcs-lynx.h @@ -9,13 +9,10 @@ #include #include -struct lynx_pcs { - struct phylink_pcs pcs; - struct mdio_device *mdio; -}; +struct mdio_device *lynx_get_mdio_device(struct phylink_pcs *pcs); -struct lynx_pcs *lynx_pcs_create(struct mdio_device *mdio); +struct phylink_pcs *lynx_pcs_create(struct mdio_device *mdio); -void lynx_pcs_destroy(struct lynx_pcs *pcs); +void lynx_pcs_destroy(struct phylink_pcs *pcs); #endif /* __LINUX_PCS_LYNX_H */ -- cgit v1.2.3 From ece014141cd4b49f2d763f28b19e417b84460560 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 3 May 2021 07:29:47 -0400 Subject: mm/doc: Add documentation for folio_test_uptodate Move the PG_uptodate documentation to be documentation for folio_test_uptodate() and expand on it a little. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/page-flags.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b5f14d581113..b3d353d537e2 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -68,9 +68,6 @@ * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as * a result of MADV_FREE). * - * PG_uptodate tells whether the page's contents is valid. When a read - * completes, the page becomes uptodate, unless a disk I/O error happened. - * * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * @@ -615,6 +612,16 @@ TESTPAGEFLAG_FALSE(Ksm, ksm) u64 stable_page_flags(struct page *page); +/** + * folio_test_uptodate - Is this folio up to date? 
+ * @folio: The folio. + * + * The uptodate flag is set on a folio when every byte in the folio is + * at least as new as the corresponding bytes on storage. Anonymous + * and CoW folios are always uptodate. If the folio is not uptodate, + * some of the bytes in it may be; see the is_partially_uptodate() + * address_space operation. + */ static inline bool folio_test_uptodate(struct folio *folio) { bool ret = test_bit(PG_uptodate, folio_flags(folio, 0)); -- cgit v1.2.3 From 10331795fb7991a39ebd0330fdb074cbd81fef48 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Dec 2021 15:24:51 -0500 Subject: pagevec: Add folio_batch The folio_batch is the same as the pagevec, except that it is typed to contain folios and not pages. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski --- include/linux/pagevec.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 7f3f19065a9f..c3fa616d7ae7 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -15,8 +15,10 @@ #define PAGEVEC_SIZE 15 struct page; +struct folio; struct address_space; +/* Layout must match folio_batch */ struct pagevec { unsigned char nr; bool percpu_pvec_drained; @@ -81,4 +83,71 @@ static inline void pagevec_release(struct pagevec *pvec) __pagevec_release(pvec); } +/** + * struct folio_batch - A collection of folios. + * + * The folio_batch is used to amortise the cost of retrieving and + * operating on a set of folios. The order of folios in the batch may be + * significant (eg delete_from_page_cache_batch()). Some users of the + * folio_batch store "exceptional" entries in it which can be removed + * by calling folio_batch_remove_exceptionals(). 
+ */ +struct folio_batch { + unsigned char nr; + bool percpu_pvec_drained; + struct folio *folios[PAGEVEC_SIZE]; +}; + +/* Layout must match pagevec */ +static_assert(sizeof(struct pagevec) == sizeof(struct folio_batch)); +static_assert(offsetof(struct pagevec, pages) == + offsetof(struct folio_batch, folios)); + +/** + * folio_batch_init() - Initialise a batch of folios + * @fbatch: The folio batch. + * + * A freshly initialised folio_batch contains zero folios. + */ +static inline void folio_batch_init(struct folio_batch *fbatch) +{ + fbatch->nr = 0; +} + +static inline unsigned int folio_batch_count(struct folio_batch *fbatch) +{ + return fbatch->nr; +} + +static inline unsigned int fbatch_space(struct folio_batch *fbatch) +{ + return PAGEVEC_SIZE - fbatch->nr; +} + +/** + * folio_batch_add() - Add a folio to a batch. + * @fbatch: The folio batch. + * @folio: The folio to add. + * + * The folio is added to the end of the batch. + * The batch must have previously been initialised using folio_batch_init(). + * + * Return: The number of slots still available. 
+ */ +static inline unsigned folio_batch_add(struct folio_batch *fbatch, + struct folio *folio) +{ + fbatch->folios[fbatch->nr++] = folio; + return fbatch_space(fbatch); +} + +static inline void folio_batch_release(struct folio_batch *fbatch) +{ + pagevec_release((struct pagevec *)fbatch); +} + +static inline void folio_batch_remove_exceptionals(struct folio_batch *fbatch) +{ + pagevec_remove_exceptionals((struct pagevec *)fbatch); +} #endif /* _LINUX_PAGEVEC_H */ -- cgit v1.2.3 From e66d70c034dbdfe1a48863f0865ac86aaf2fef1a Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Mon, 13 Dec 2021 10:51:41 +0530 Subject: dmaengine: xilinx_dpdma: use correct SPDX tag for header file Commit 188c310bdd5d ("dmaengine: xilinx_dpdma: stop using slave_id field") added the header file with incorrect format for SPDX tag, fix that WARNING: Improper SPDX comment style for 'include/linux/dma/xilinx_dpdma.h', please use '/*' instead #1: FILE: include/linux/dma/xilinx_dpdma.h:1: +// SPDX-License-Identifier: GPL-2.0 WARNING: Missing or malformed SPDX-License-Identifier tag in line 1 #1: FILE: include/linux/dma/xilinx_dpdma.h:1: +// SPDX-License-Identifier: GPL-2.0 Fixes: 188c310bdd5d ("dmaengine: xilinx_dpdma: stop using slave_id field") Signed-off-by: Vinod Koul Link: https://lore.kernel.org/r/20211213052141.850807-1-vkoul@kernel.org Signed-off-by: Vinod Koul --- include/linux/dma/xilinx_dpdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma/xilinx_dpdma.h b/include/linux/dma/xilinx_dpdma.h index 83a1377f03f8..02a4adf8921b 100644 --- a/include/linux/dma/xilinx_dpdma.h +++ b/include/linux/dma/xilinx_dpdma.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_DMA_XILINX_DPDMA_H #define __LINUX_DMA_XILINX_DPDMA_H -- cgit v1.2.3 From 25fd330370ac40653671f323acc7fb6db27ef6fe Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 15 Dec 2021 02:01:18 +0100 Subject: power:
supply_core: Pass pointer to battery info The function to retrieve battery info (from the device tree) assumes we have a static info struct that gets populated by calling into power_supply_get_battery_info(). This is awkward since I want to support tables of static battery info by just assigning a pointer to all info based on e.g. a compatible value in the device tree. We also have a mixture of static and dynamically allocated variables here. Bite the bullet and let power_supply_get_battery_info() allocate also the memory used for the very top level struct power_supply_battery_info container. Pass pointers around and lifecycle this with the psy device just like the stuff we allocate inside it. Change all current users over. As part of the change, initializers need to be added to some previously uninitialized fields in struct power_supply_battery_info. Reviewed-By: Matti Vaittinen Signed-off-by: Linus Walleij Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index f6e94eae4f28..86b4d5c4dab9 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -575,7 +575,7 @@ devm_power_supply_get_by_phandle(struct device *dev, const char *property) #endif /* CONFIG_OF */ extern int power_supply_get_battery_info(struct power_supply *psy, - struct power_supply_battery_info *info); + struct power_supply_battery_info **info_out); extern void power_supply_put_battery_info(struct power_supply *psy, struct power_supply_battery_info *info); extern int power_supply_ocv2cap_simple(struct power_supply_battery_ocv_table *table, -- cgit v1.2.3 From d5dbcca70182501bed99de85c224cef04c38ed92 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 3 Jan 2022 17:24:08 +0100 Subject: pktcdvd: convert to use attribute groups There is no need to create kobject children of the pktcdvd device just to 
display a subdirectory name. Instead, use a named attribute group which removes the extra kobjects and also fixes the userspace race where the device is created yet tools like libudev can not see the attributes as they think the subdirectories are some other sort of device. Cc: linux-block@vger.kernel.org Cc: Jens Axboe Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220103162408.742003-1-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- include/linux/pktcdvd.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index c391e694aa26..f9c5ac80d59b 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -152,14 +152,6 @@ struct packet_stacked_data }; #define PSD_POOL_SIZE 64 -struct pktcdvd_kobj -{ - struct kobject kobj; - struct pktcdvd_device *pd; -}; -#define to_pktcdvdkobj(_k) \ - ((struct pktcdvd_kobj*)container_of(_k,struct pktcdvd_kobj,kobj)) - struct pktcdvd_device { struct block_device *bdev; /* dev attached */ @@ -197,8 +189,6 @@ struct pktcdvd_device int write_congestion_on; struct device *dev; /* sysfs pktcdvd[0-7] dev */ - struct pktcdvd_kobj *kobj_stat; /* sysfs pktcdvd[0-7]/stat/ */ - struct pktcdvd_kobj *kobj_wqueue; /* sysfs pktcdvd[0-7]/write_queue/ */ struct dentry *dfs_d_root; /* debugfs: devname directory */ struct dentry *dfs_f_info; /* debugfs: info file */ -- cgit v1.2.3 From 6c952a0dc9c3ced98c4c8aa7cd11c25c59157f1f Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:20:26 +0100 Subject: ata: libata: Add ata_port_classify() helper Add an ata_port_classify() helper to print out the results from the device classification and remove the debugging statements from ata_dev_classify(). 
Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 2a8404b26083..235fdbeb19ea 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1160,6 +1160,8 @@ extern enum ata_completion_errors ata_noop_qc_prep(struct ata_queued_cmd *qc); extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg, unsigned int n_elem); extern unsigned int ata_dev_classify(const struct ata_taskfile *tf); +extern unsigned int ata_port_classify(struct ata_port *ap, + const struct ata_taskfile *tf); extern void ata_dev_disable(struct ata_device *adev); extern void ata_id_string(const u16 *id, unsigned char *s, unsigned int ofs, unsigned int len); -- cgit v1.2.3 From e41294408c56c68ea0f269d757527bf33b39118a Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 3 Jan 2022 18:11:31 +0100 Subject: icmp: ICMPV6: Examine invoking packet for Segment Route Headers. RFC8754 says: ICMP error packets generated within the SR domain are sent to source nodes within the SR domain. The invoking packet in the ICMP error message may contain an SRH. Since the destination address of a packet with an SRH changes as each segment is processed, it may not be the destination used by the socket or application that generated the invoking packet. For the source of an invoking packet to process the ICMP error message, the ultimate destination address of the IPv6 header may be required. The following logic is used to determine the destination address for use by protocol-error handlers. * Walk all extension headers of the invoking IPv6 packet to the routing extension header preceding the upper-layer header. - If routing header is type 4 Segment Routing Header (SRH) o The SID at Segment List[0] may be used as the destination address of the invoking packet. 
Mangle the skb so the network header points to the invoking packet inside the ICMP packet. The seg6 helpers can then be used on the skb to find any segment routing headers. If found, mark this fact in the IPv6 control block of the skb, and store the offset into the packet of the SRH. Then restore the skb back to its old state. Signed-off-by: Andrew Lunn Reviewed-by: David Ahern Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 20c1f968da7c..a59d25f19385 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -133,6 +133,7 @@ struct inet6_skb_parm { __u16 dsthao; #endif __u16 frag_max_size; + __u16 srhoff; #define IP6SKB_XFRM_TRANSFORMED 1 #define IP6SKB_FORWARDED 2 @@ -142,6 +143,7 @@ struct inet6_skb_parm { #define IP6SKB_HOPBYHOP 32 #define IP6SKB_L3SLAVE 64 #define IP6SKB_JUMBOGRAM 128 +#define IP6SKB_SEG6 256 }; #if defined(CONFIG_NET_L3_MASTER_DEV) -- cgit v1.2.3 From f083266487690124481eac0869da850406fb3ed3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 29 Aug 2021 09:18:53 +0200 Subject: headers/uninline: Uninline single-use function: kobject_has_children() This was the only usage of <linux/kernfs.h> in <linux/kobject.h>, so we'll be able to decouple the two after this change. Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index ad90b49824dc..c7b47399b36a 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -117,23 +117,6 @@ extern void kobject_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid); extern char *kobject_get_path(struct kobject *kobj, gfp_t flag); -/** - * kobject_has_children - Returns whether a kobject has children.
- * @kobj: the object to test - * - * This will return whether a kobject has other kobjects as children. - * - * It does NOT account for the presence of attribute files, only sub - * directories. It also assumes there is no concurrent addition or - * removal of such children, and thus relies on external locking. - */ -static inline bool kobject_has_children(struct kobject *kobj) -{ - WARN_ON_ONCE(kref_read(&kobj->kref) == 0); - - return kobj->sd && kobj->sd->dir.subdirs; -} - struct kobj_type { void (*release)(struct kobject *kobj); const struct sysfs_ops *sysfs_ops; -- cgit v1.2.3 From d9c19d32d86fa54934b632c4314beb067bf98378 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 18 Oct 2021 10:39:06 -0400 Subject: iov_iter: Add copy_folio_to_iter() This wrapper around copy_page_to_iter() works because copy_page_to_iter() handles compound pages correctly. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski --- include/linux/uio.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 6350354f97e9..43321dbebba8 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -7,6 +7,7 @@ #include #include +#include #include struct page; @@ -146,6 +147,12 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); +static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, + size_t bytes, struct iov_iter *i) +{ + return copy_page_to_iter(&folio->page, offset, bytes, i); +} + static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { -- cgit v1.2.3 From 5bf34d7c7ffe773c3b3c1b6ebf39e0f34a2436ec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 28 Nov 2021 14:24:43 -0500 Subject: mm: Add folio_test_pmd_mappable() Add 
a predicate to determine if the folio might be mapped by a PMD entry. If CONFIG_TRANSPARENT_HUGEPAGE is disabled, we know it can't be, even if it's large enough. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/huge_mm.h | 14 ++++++++++++++ include/linux/mm.h | 42 +++++++++++++++++++++--------------------- 2 files changed, 35 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f280f33ff223..e4c18ba8d3bf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -274,6 +274,15 @@ static inline int thp_nr_pages(struct page *page) return 1; } +/** + * folio_test_pmd_mappable - Can we map this folio with a PMD? + * @folio: The folio to test + */ +static inline bool folio_test_pmd_mappable(struct folio *folio) +{ + return folio_order(folio) >= HPAGE_PMD_ORDER; +} + struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap); struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, @@ -339,6 +348,11 @@ static inline int thp_nr_pages(struct page *page) return 1; } +static inline bool folio_test_pmd_mappable(struct folio *folio) +{ + return false; +} + static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { return false; diff --git a/include/linux/mm.h b/include/linux/mm.h index a7e4a9e7d807..72ca04f16711 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -714,6 +714,27 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; +static inline unsigned int compound_order(struct page *page) +{ + if (!PageHead(page)) + return 0; + return page[1].compound_order; +} + +/** + * folio_order - The allocation order of a folio. + * @folio: The folio. + * + * A folio is composed of 2^order pages. See get_order() for the definition + * of order. 
+ * + * Return: The order of the folio. + */ +static inline unsigned int folio_order(struct folio *folio) +{ + return compound_order(&folio->page); +} + #include /* @@ -906,27 +927,6 @@ static inline void destroy_compound_page(struct page *page) compound_page_dtors[page[1].compound_dtor](page); } -static inline unsigned int compound_order(struct page *page) -{ - if (!PageHead(page)) - return 0; - return page[1].compound_order; -} - -/** - * folio_order - The allocation order of a folio. - * @folio: The folio. - * - * A folio is composed of 2^order pages. See get_order() for the definition - * of order. - * - * Return: The order of the folio. - */ -static inline unsigned int folio_order(struct folio *folio) -{ - return compound_order(&folio->page); -} - static inline bool hpage_pincount_available(struct page *page) { /* -- cgit v1.2.3 From 9f2b04a25a41b1f41b3cead4f56854a4192ec5b0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Aug 2021 23:36:31 -0400 Subject: filemap: Add folio_put_wait_locked() Convert all three callers of put_and_wait_on_page_locked() to folio_put_wait_locked(). This shrinks the kernel overall by 19 bytes. filemap_update_page() shrinks by 19 bytes while __migration_entry_wait() is unchanged. folio_put_wait_locked() is 14 bytes smaller than put_and_wait_on_page_locked(), but pmd_migration_entry_wait() grows by 14 bytes. It removes the assumption from pmd_migration_entry_wait() that pages cannot be larger than a PMD (which is true today, but may be interesting to explore in the future). 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 605246452305..841f7ba62d7d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -868,7 +868,7 @@ static inline int wait_on_page_locked_killable(struct page *page) return folio_wait_locked_killable(page_folio(page)); } -int put_and_wait_on_page_locked(struct page *page, int state); +int folio_put_wait_locked(struct folio *folio, int state); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); -- cgit v1.2.3 From 621db4880d305bc37b343b1671e03b7eb5d61389 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 8 May 2021 20:04:05 -0400 Subject: filemap: Add filemap_unaccount_folio() Replace unaccount_page_cache_page() with filemap_unaccount_folio(). The bug handling path could be a bit more robust (eg taking into account the mapcounts of tail pages), but it's really never supposed to happen. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 841f7ba62d7d..077b6f378666 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -884,11 +884,6 @@ static inline void __set_page_dirty(struct page *page, } void folio_account_cleaned(struct folio *folio, struct address_space *mapping, struct bdi_writeback *wb); -static inline void account_page_cleaned(struct page *page, - struct address_space *mapping, struct bdi_writeback *wb) -{ - return folio_account_cleaned(page_folio(page), mapping, wb); -} void __folio_cancel_dirty(struct folio *folio); static inline void folio_cancel_dirty(struct folio *folio) { -- cgit v1.2.3 From 452e9e6992fe058a650c81d01a9982e3faf10278 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 9 May 2021 09:33:42 -0400 Subject: filemap: Add filemap_remove_folio and __filemap_remove_folio Reimplement __delete_from_page_cache() as a wrapper around __filemap_remove_folio() and delete_from_page_cache() as a wrapper around filemap_remove_folio(). Remove the EXPORT_SYMBOL as delete_from_page_cache() was not used by any in-tree modules. Convert page_cache_free_page() into filemap_free_folio(). 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/pagemap.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 077b6f378666..3f26b191ede3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -930,8 +930,13 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp); int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); -extern void delete_from_page_cache(struct page *page); -extern void __delete_from_page_cache(struct page *page, void *shadow); +void filemap_remove_folio(struct folio *folio); +void delete_from_page_cache(struct page *page); +void __filemap_remove_folio(struct folio *folio, void *shadow); +static inline void __delete_from_page_cache(struct page *page, void *shadow) +{ + __filemap_remove_folio(page_folio(page), shadow); +} void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); -- cgit v1.2.3 From bb2e98b613a3c76c904dfa82eb4b86773817598b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 28 Nov 2021 16:14:50 -0500 Subject: filemap: Remove thp_contains() This function is now unused, so delete it. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/pagemap.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3f26b191ede3..8c2cad7f0c36 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -512,15 +512,6 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } -/* Does this page contain this index? 
*/ -static inline bool thp_contains(struct page *head, pgoff_t index) -{ - /* HugeTLBfs indexes the page cache in units of hpage_size */ - if (PageHuge(head)) - return head->index == index; - return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL)); -} - #define swapcache_index(folio) __page_file_index(&(folio)->page) /** -- cgit v1.2.3 From 7836d9990079ed611199819ccf487061b748193a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 27 May 2021 12:30:54 -0400 Subject: readahead: Convert page_cache_async_ra() to take a folio Using the folio here avoids checking whether it's a tail page. This patch mostly just enables some of the following patches. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/pagemap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8c2cad7f0c36..30302be6977f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -993,7 +993,7 @@ struct readahead_control { void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); void page_cache_sync_ra(struct readahead_control *, unsigned long req_count); -void page_cache_async_ra(struct readahead_control *, struct page *, +void page_cache_async_ra(struct readahead_control *, struct folio *, unsigned long req_count); void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len); @@ -1040,7 +1040,7 @@ void page_cache_async_readahead(struct address_space *mapping, struct page *page, pgoff_t index, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, ra, mapping, index); - page_cache_async_ra(&ractl, page, req_count); + page_cache_async_ra(&ractl, page_folio(page), req_count); } static inline struct folio *__readahead_folio(struct readahead_control *ractl) -- cgit v1.2.3 From 539a3322f208db478db88c4a76239476defce6b1 
Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Dec 2020 11:45:30 -0500 Subject: filemap: Add read_cache_folio and read_mapping_folio Reimplement read_cache_page() as a wrapper around read_cache_folio(). Saves over 400 bytes of text from do_read_cache_folio() which more than makes up for the extra 100 bytes of text added to the various wrapper functions. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski --- include/linux/pagemap.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 30302be6977f..7bef50ea5435 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -629,8 +629,10 @@ static inline struct page *grab_cache_page(struct address_space *mapping, return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); } -extern struct page * read_cache_page(struct address_space *mapping, - pgoff_t index, filler_t *filler, void *data); +struct folio *read_cache_folio(struct address_space *, pgoff_t index, + filler_t *filler, void *data); +struct page *read_cache_page(struct address_space *, pgoff_t index, + filler_t *filler, void *data); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern int read_cache_pages(struct address_space *mapping, @@ -642,6 +644,12 @@ static inline struct page *read_mapping_page(struct address_space *mapping, return read_cache_page(mapping, index, NULL, data); } +static inline struct folio *read_mapping_folio(struct address_space *mapping, + pgoff_t index, void *data) +{ + return read_cache_folio(mapping, index, NULL, data); +} + /* * Get index of the page within radix-tree (but not for hugetlb pages). 
* (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) -- cgit v1.2.3 From 82c50f8b443359ec99348cd9b1289f55cd47779d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 28 Jul 2021 15:14:48 -0400 Subject: filemap: Add filemap_release_folio() Reimplement try_to_release_page() as a wrapper around filemap_release_folio(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/mm.h | 1 - include/linux/pagemap.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 72ca04f16711..145f045b0ddc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1970,7 +1970,6 @@ int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); -extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7bef50ea5435..eb6e58e106c8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -939,6 +939,8 @@ static inline void __delete_from_page_cache(struct page *page, void *shadow) void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); +int try_to_release_page(struct page *page, gfp_t gfp); +bool filemap_release_folio(struct folio *folio, gfp_t gfp); loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, int whence); -- cgit v1.2.3 From 77e2a04745ff8e391ad402e2d2d1157a5d3a7ebc Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Tue, 4 Jan 2022 19:51:08 +0000 Subject: ACPI: PCC: Implement OperationRegion handler for the PCC Type 3 subtype PCC OpRegion provides a mechanism to communicate with the platform directly from the AML. 
PCCT provides the list of PCC channels available in the platform; a subset or all of them can be used in PCC Opregion. This patch registers the PCC OpRegion handler before ACPI tables are loaded. This relies on the special context data passed to identify and set up the PCC channel before the OpRegion handler is executed for the first time. Typical PCC Opregion declaration looks like this: OperationRegion (PFRM, PCC, 2, 0x74) Field (PFRM, ByteAcc, NoLock, Preserve) { SIGN, 32, FLGS, 32, LEN, 32, CMD, 32, DATA, 800 } It contains four named double words followed by 100 bytes of buffer named DATA. ASL can fill out the buffer something like: /* Create global or local buffer */ Name (BUFF, Buffer (0x0C){}) /* Create double word fields over the buffer */ CreateDWordField (BUFF, 0x0, WD0) CreateDWordField (BUFF, 0x04, WD1) CreateDWordField (BUFF, 0x08, WD2) /* Fill the named fields */ WD0 = 0x50434300 SIGN = BUFF WD0 = 1 FLGS = BUFF WD0 = 0x10 LEN = BUFF /* Fill the payload in the DATA buffer */ WD0 = 0 WD1 = 0x08 WD2 = 0 DATA = BUFF /* Write to CMD field to trigger handler */ WD0 = 0x4404 CMD = BUFF This buffer is received by acpi_pcc_opregion_space_handler. This handler will fetch the complete buffer via internal_pcc_buffer. The setup handler will receive the special PCC context data which will contain the PCC channel index which is used to set up the channel. The buffer pointer and length are saved in region context which is then used in the handler. (kernel test robot: Build failure with CONFIG_ACPI_DEBUGGER) Link: https://lore.kernel.org/r/202201041539.feAV0l27-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Sudeep Holla Signed-off-by: Rafael J.
Wysocki --- include/linux/acpi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index b28f8790192a..93eaba2485e3 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1389,6 +1389,12 @@ static inline int find_acpi_cpu_cache_topology(unsigned int cpu, int level) } #endif +#ifdef CONFIG_ACPI_PCC +void acpi_init_pcc(void); +#else +static inline void acpi_init_pcc(void) { } +#endif + #ifdef CONFIG_ACPI extern void acpi_device_notify(struct device *dev); extern void acpi_device_notify_remove(struct device *dev); -- cgit v1.2.3 From 742bef476ca5352b16063161fb73a56629a6d995 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:20:35 +0100 Subject: ata: libata: move ata_{port,link,dev}_dbg to standard pr_XXX() macros Use standard pr_{debug,info,notice,warn,err} macros instead of the hand-crafted printk helpers. Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/libata.h | 69 +++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 235fdbeb19ea..39cdde0b9491 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1489,51 +1489,61 @@ static inline int sata_srst_pmp(struct ata_link *link) return link->pmp; } -/* - * printk helpers - */ -__printf(3, 4) -void ata_port_printk(const struct ata_port *ap, const char *level, - const char *fmt, ...); -__printf(3, 4) -void ata_link_printk(const struct ata_link *link, const char *level, - const char *fmt, ...); -__printf(3, 4) -void ata_dev_printk(const struct ata_device *dev, const char *level, - const char *fmt, ...); +#define ata_port_printk(level, ap, fmt, ...) \ + pr_ ## level ("ata%u: " fmt, (ap)->print_id, ##__VA_ARGS__) #define ata_port_err(ap, fmt, ...) 
\ - ata_port_printk(ap, KERN_ERR, fmt, ##__VA_ARGS__) + ata_port_printk(err, ap, fmt, ##__VA_ARGS__) #define ata_port_warn(ap, fmt, ...) \ - ata_port_printk(ap, KERN_WARNING, fmt, ##__VA_ARGS__) + ata_port_printk(warn, ap, fmt, ##__VA_ARGS__) #define ata_port_notice(ap, fmt, ...) \ - ata_port_printk(ap, KERN_NOTICE, fmt, ##__VA_ARGS__) + ata_port_printk(notice, ap, fmt, ##__VA_ARGS__) #define ata_port_info(ap, fmt, ...) \ - ata_port_printk(ap, KERN_INFO, fmt, ##__VA_ARGS__) + ata_port_printk(info, ap, fmt, ##__VA_ARGS__) #define ata_port_dbg(ap, fmt, ...) \ - ata_port_printk(ap, KERN_DEBUG, fmt, ##__VA_ARGS__) + ata_port_printk(debug, ap, fmt, ##__VA_ARGS__) + +#define ata_link_printk(level, link, fmt, ...) \ +do { \ + if (sata_pmp_attached((link)->ap) || \ + (link)->ap->slave_link) \ + pr_ ## level ("ata%u.%02u: " fmt, \ + (link)->ap->print_id, \ + (link)->pmp, \ + ##__VA_ARGS__); \ + else \ + pr_ ## level ("ata%u: " fmt, \ + (link)->ap->print_id, \ + ##__VA_ARGS__); \ +} while (0) #define ata_link_err(link, fmt, ...) \ - ata_link_printk(link, KERN_ERR, fmt, ##__VA_ARGS__) + ata_link_printk(err, link, fmt, ##__VA_ARGS__) #define ata_link_warn(link, fmt, ...) \ - ata_link_printk(link, KERN_WARNING, fmt, ##__VA_ARGS__) + ata_link_printk(warn, link, fmt, ##__VA_ARGS__) #define ata_link_notice(link, fmt, ...) \ - ata_link_printk(link, KERN_NOTICE, fmt, ##__VA_ARGS__) + ata_link_printk(notice, link, fmt, ##__VA_ARGS__) #define ata_link_info(link, fmt, ...) \ - ata_link_printk(link, KERN_INFO, fmt, ##__VA_ARGS__) + ata_link_printk(info, link, fmt, ##__VA_ARGS__) #define ata_link_dbg(link, fmt, ...) \ - ata_link_printk(link, KERN_DEBUG, fmt, ##__VA_ARGS__) + ata_link_printk(debug, link, fmt, ##__VA_ARGS__) + +#define ata_dev_printk(level, dev, fmt, ...) \ + pr_ ## level("ata%u.%02u: " fmt, \ + (dev)->link->ap->print_id, \ + (dev)->link->pmp + (dev)->devno, \ + ##__VA_ARGS__) #define ata_dev_err(dev, fmt, ...) 
\ - ata_dev_printk(dev, KERN_ERR, fmt, ##__VA_ARGS__) + ata_dev_printk(err, dev, fmt, ##__VA_ARGS__) #define ata_dev_warn(dev, fmt, ...) \ - ata_dev_printk(dev, KERN_WARNING, fmt, ##__VA_ARGS__) + ata_dev_printk(warn, dev, fmt, ##__VA_ARGS__) #define ata_dev_notice(dev, fmt, ...) \ - ata_dev_printk(dev, KERN_NOTICE, fmt, ##__VA_ARGS__) + ata_dev_printk(notice, dev, fmt, ##__VA_ARGS__) #define ata_dev_info(dev, fmt, ...) \ - ata_dev_printk(dev, KERN_INFO, fmt, ##__VA_ARGS__) + ata_dev_printk(info, dev, fmt, ##__VA_ARGS__) #define ata_dev_dbg(dev, fmt, ...) \ - ata_dev_printk(dev, KERN_DEBUG, fmt, ##__VA_ARGS__) + ata_dev_printk(debug, dev, fmt, ##__VA_ARGS__) void ata_print_version(const struct device *dev, const char *version); @@ -2067,11 +2077,8 @@ static inline u8 ata_wait_idle(struct ata_port *ap) { u8 status = ata_sff_busy_wait(ap, ATA_BUSY | ATA_DRQ, 1000); -#ifdef ATA_DEBUG if (status != 0xff && (status & (ATA_BUSY | ATA_DRQ))) - ata_port_printk(ap, KERN_DEBUG, "abnormal Status 0x%X\n", - status); -#endif + ata_port_dbg(ap, "abnormal Status 0x%X\n", status); return status; } -- cgit v1.2.3 From d97c75edd806669c9f4b56c0ddae37725c0b708c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:21:01 +0100 Subject: ata: libata: drop ata_msg_error() and ata_msg_intr() Unused. 
Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/libata.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 39cdde0b9491..4f0a85f4e69a 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -78,8 +78,6 @@ enum { ATA_MSG_WARN = 0x0008, ATA_MSG_MALLOC = 0x0010, ATA_MSG_CTL = 0x0020, - ATA_MSG_INTR = 0x0040, - ATA_MSG_ERR = 0x0080, }; #define ata_msg_drv(p) ((p)->msg_enable & ATA_MSG_DRV) @@ -88,8 +86,6 @@ enum { #define ata_msg_warn(p) ((p)->msg_enable & ATA_MSG_WARN) #define ata_msg_malloc(p) ((p)->msg_enable & ATA_MSG_MALLOC) #define ata_msg_ctl(p) ((p)->msg_enable & ATA_MSG_CTL) -#define ata_msg_intr(p) ((p)->msg_enable & ATA_MSG_INTR) -#define ata_msg_err(p) ((p)->msg_enable & ATA_MSG_ERR) static inline u32 ata_msg_init(int dval, int default_msg_enable_bits) { -- cgit v1.2.3 From 5cef96b4207e01c9cdb7752acaa178056fe94632 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:21:02 +0100 Subject: ata: libata: drop ata_msg_ctl() The one caller has been converted to dynamic debugging.
Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 4f0a85f4e69a..e384cce62963 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -77,7 +77,6 @@ enum { ATA_MSG_PROBE = 0x0004, ATA_MSG_WARN = 0x0008, ATA_MSG_MALLOC = 0x0010, - ATA_MSG_CTL = 0x0020, }; #define ata_msg_drv(p) ((p)->msg_enable & ATA_MSG_DRV) @@ -85,7 +84,6 @@ enum { #define ata_msg_probe(p) ((p)->msg_enable & ATA_MSG_PROBE) #define ata_msg_warn(p) ((p)->msg_enable & ATA_MSG_WARN) #define ata_msg_malloc(p) ((p)->msg_enable & ATA_MSG_MALLOC) -#define ata_msg_ctl(p) ((p)->msg_enable & ATA_MSG_CTL) static inline u32 ata_msg_init(int dval, int default_msg_enable_bits) { -- cgit v1.2.3 From 2f784b923d50cdef1f6bd24d7c18614321b0833a Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:21:03 +0100 Subject: ata: libata: drop ata_msg_malloc() Unused. 
Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index e384cce62963..5651bbf4902b 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -76,14 +76,12 @@ enum { ATA_MSG_INFO = 0x0002, ATA_MSG_PROBE = 0x0004, ATA_MSG_WARN = 0x0008, - ATA_MSG_MALLOC = 0x0010, }; #define ata_msg_drv(p) ((p)->msg_enable & ATA_MSG_DRV) #define ata_msg_info(p) ((p)->msg_enable & ATA_MSG_INFO) #define ata_msg_probe(p) ((p)->msg_enable & ATA_MSG_PROBE) #define ata_msg_warn(p) ((p)->msg_enable & ATA_MSG_WARN) -#define ata_msg_malloc(p) ((p)->msg_enable & ATA_MSG_MALLOC) static inline u32 ata_msg_init(int dval, int default_msg_enable_bits) { -- cgit v1.2.3 From 16d424672716dc886fb58ec4a47a408db4781cc0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:21:04 +0100 Subject: ata: libata: drop ata_msg_warn() The WARN level was always enabled, so drop ata_msg_warn(). 
Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 5651bbf4902b..0e5ed2ff94be 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -75,13 +75,11 @@ enum { ATA_MSG_DRV = 0x0001, ATA_MSG_INFO = 0x0002, ATA_MSG_PROBE = 0x0004, - ATA_MSG_WARN = 0x0008, }; #define ata_msg_drv(p) ((p)->msg_enable & ATA_MSG_DRV) #define ata_msg_info(p) ((p)->msg_enable & ATA_MSG_INFO) #define ata_msg_probe(p) ((p)->msg_enable & ATA_MSG_PROBE) -#define ata_msg_warn(p) ((p)->msg_enable & ATA_MSG_WARN) static inline u32 ata_msg_init(int dval, int default_msg_enable_bits) { -- cgit v1.2.3 From 17a1e1be2fc7dc99945b41df0485037dcb6044d0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:21:05 +0100 Subject: ata: libata: drop ata_msg_probe() All callsites have been converted to dynamic debugging. Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 0e5ed2ff94be..455d7e77e562 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -74,12 +74,10 @@ enum { ATA_MSG_DRV = 0x0001, ATA_MSG_INFO = 0x0002, - ATA_MSG_PROBE = 0x0004, }; #define ata_msg_drv(p) ((p)->msg_enable & ATA_MSG_DRV) #define ata_msg_info(p) ((p)->msg_enable & ATA_MSG_INFO) -#define ata_msg_probe(p) ((p)->msg_enable & ATA_MSG_PROBE) static inline u32 ata_msg_init(int dval, int default_msg_enable_bits) { -- cgit v1.2.3 From 96c810f216cb6da15bfa8fe8ef3bf73ca91c5dd8 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:21:06 +0100 Subject: ata: libata: drop ata_msg_info() Convert the sole caller to ata_dev_dbg() and remove the definition. 
Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 455d7e77e562..524d09b1dc82 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -73,11 +73,9 @@ enum { ATA_MSG_DRV = 0x0001, - ATA_MSG_INFO = 0x0002, }; #define ata_msg_drv(p) ((p)->msg_enable & ATA_MSG_DRV) -#define ata_msg_info(p) ((p)->msg_enable & ATA_MSG_INFO) static inline u32 ata_msg_init(int dval, int default_msg_enable_bits) { -- cgit v1.2.3 From 1c95a27c1e544f723f6e0e5a4384098f92996ec0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:21:07 +0100 Subject: ata: libata: drop ata_msg_drv() Callers are already protected by ata_dev_print_info(), so no need to have an additional configuration parameter here. Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/libata.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 524d09b1dc82..65172609a005 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -71,12 +71,6 @@ /* NEW: debug levels */ #define HAVE_LIBATA_MSG 1 -enum { - ATA_MSG_DRV = 0x0001, -}; - -#define ata_msg_drv(p) ((p)->msg_enable & ATA_MSG_DRV) - static inline u32 ata_msg_init(int dval, int default_msg_enable_bits) { if (dval < 0 || dval >= (sizeof(u32) * 8)) -- cgit v1.2.3 From db45905e74e6ae035305719bc683eca40f526669 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:21:08 +0100 Subject: ata: libata: remove 'new' ata message handling Remove the remaining bits for the 'new' ata message handling. 
Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/libata.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 65172609a005..145c0132b75e 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -68,18 +68,6 @@ } \ }) -/* NEW: debug levels */ -#define HAVE_LIBATA_MSG 1 - -static inline u32 ata_msg_init(int dval, int default_msg_enable_bits) -{ - if (dval < 0 || dval >= (sizeof(u32) * 8)) - return default_msg_enable_bits; /* should be 0x1 - only driver info msgs */ - if (!dval) - return 0; - return (1 << dval) - 1; -} - /* defines only for the constants which don't work well as enums */ #define ATA_TAG_POISON 0xfafbfcfdU @@ -864,7 +852,6 @@ struct ata_port { unsigned int hsm_task_state; - u32 msg_enable; struct list_head eh_done_q; wait_queue_head_t eh_wait_q; int eh_tries; -- cgit v1.2.3 From 870bb833c0acb29d8471eac5c2d2e6274826dbb6 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:21:09 +0100 Subject: ata: libata: remove debug compilation switches Unused now, so remove and drop any references to them. Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal --- include/linux/libata.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 145c0132b75e..c258f69106f4 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -39,25 +39,9 @@ * compile-time options: to be removed as soon as all the drivers are * converted to the new debugging mechanism */ -#undef ATA_DEBUG /* debugging output */ -#undef ATA_VERBOSE_DEBUG /* yet more debugging output */ #undef ATA_IRQ_TRAP /* define to ack screaming irqs */ -#undef ATA_NDEBUG /* define to disable quick runtime checks */ -/* note: prints function name for you */ -#ifdef ATA_DEBUG -#define DPRINTK(fmt, args...) 
printk(KERN_ERR "%s: " fmt, __func__, ## args) -#ifdef ATA_VERBOSE_DEBUG -#define VPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args) -#else -#define VPRINTK(fmt, args...) -#endif /* ATA_VERBOSE_DEBUG */ -#else -#define DPRINTK(fmt, args...) -#define VPRINTK(fmt, args...) -#endif /* ATA_DEBUG */ - #define ata_print_version_once(dev, version) \ ({ \ static bool __print_once; \ -- cgit v1.2.3 From cc4b08c31b5c51352f258032cc65e884b3e61e6a Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 7 Dec 2021 21:15:31 +0900 Subject: can: do not increase tx_bytes statistics for RTR frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The actual payload length of the CAN Remote Transmission Request (RTR) frames is always 0, i.e. no payload is transmitted on the wire. However, those RTR frames still use the DLC to indicate the length of the requested frame. As such, net_device_stats::tx_bytes should not be increased when sending RTR frames. The function can_get_echo_skb() already returns the correct length, even for RTR frames (c.f. [1]). However, for historical reasons, the drivers do not use can_get_echo_skb()'s return value and instead, most of them store a temporary length (or dlc) in some local structure or array. Using the return value of can_get_echo_skb() solves the issue. After doing this, such length/dlc fields become unused and so this patch does the adequate cleaning when needed. This patch fixes all the CAN drivers. Finally, can_get_echo_skb() is decorated with the __must_check attribute in order to force future drivers to correctly use its return value (else the compiler would emit a warning). 
[1] commit ed3320cec279 ("can: dev: __can_get_echo_skb(): fix real payload length return value for RTR frames") Link: https://lore.kernel.org/all/20211207121531.42941-6-mailhol.vincent@wanadoo.fr Cc: Nicolas Ferre Cc: Alexandre Belloni Cc: Ludovic Desroches Cc: Maxime Ripard Cc: Chen-Yu Tsai Cc: Jernej Skrabec Cc: Yasushi SHOJI Cc: Oliver Hartkopp Cc: Stephane Grosjean Cc: Andreas Larsson Tested-by: Jimmy Assarsson # kvaser Signed-off-by: Vincent Mailhol Acked-by: Stefan Mätje # esd_usb2 Tested-by: Stefan Mätje # esd_usb2 [mkl: add conversion for grcan] Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index d311bc369a39..fdb22b00674a 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -21,8 +21,9 @@ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr, unsigned int *frame_len_ptr); -unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx, - unsigned int *frame_len_ptr); +unsigned int __must_check can_get_echo_skb(struct net_device *dev, + unsigned int idx, + unsigned int *frame_len_ptr); void can_free_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); -- cgit v1.2.3 From c9e1d8ed304cc6106c3241add170193995953325 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 14 Dec 2021 01:02:23 +0900 Subject: can: dev: replace can_priv::ctrlmode_static by can_get_static_ctrlmode() The statically enabled features of a CAN controller can be retrieved using below formula: | u32 ctrlmode_static = priv->ctrlmode & ~priv->ctrlmode_supported; As such, there is no need to store this information. 
This patch removes the field ctrlmode_static of struct can_priv and provides, in replacement, the inline function can_get_static_ctrlmode() which returns the same value. Link: https://lore.kernel.org/all/20211213160226.56219-2-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 45f19d9db5ca..92e2d69462f0 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -69,7 +69,6 @@ struct can_priv { /* CAN controller features - see include/uapi/linux/can/netlink.h */ u32 ctrlmode; /* current options setting */ u32 ctrlmode_supported; /* options that can be modified by netlink */ - u32 ctrlmode_static; /* static enabled options for driver/hardware */ int restart_ms; struct delayed_work restart_work; @@ -139,13 +138,17 @@ static inline void can_set_static_ctrlmode(struct net_device *dev, /* alloc_candev() succeeded => netdev_priv() is valid at this point */ priv->ctrlmode = static_mode; - priv->ctrlmode_static = static_mode; /* override MTU which was set by default in can_setup()? */ if (static_mode & CAN_CTRLMODE_FD) dev->mtu = CANFD_MTU; } +static inline u32 can_get_static_ctrlmode(struct can_priv *priv) +{ + return priv->ctrlmode & ~priv->ctrlmode_supported; +} + void can_setup(struct net_device *dev); struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, -- cgit v1.2.3 From 7d4a101c0bd3c6e5c6e45c705a54f7bc8f6c128d Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 14 Dec 2021 01:02:24 +0900 Subject: can: dev: add sanity check in can_set_static_ctrlmode() Previous patch removed can_priv::ctrlmode_static to replace it with can_get_static_ctrlmode(). A condition sine qua non for this to work is that the controller static modes should never be set in can_priv::ctrlmode_supported (c.f.
the comment on can_priv::ctrlmode_supported which states that it is for "options that can be *modified* by netlink"). Also, this condition is already correctly fulfilled by all existing drivers which rely on the ctrlmode_static feature. Nonetheless, we added an extra safeguard in can_set_static_ctrlmode() to return an error value and to warn the developer who would be adventurous enough to set to static a given feature that is already set to supported. The drivers which rely on the static controller mode are then updated to check the return value of can_set_static_ctrlmode(). Link: https://lore.kernel.org/all/20211213160226.56219-3-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 92e2d69462f0..fff3f70df697 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -131,17 +131,24 @@ static inline s32 can_get_relative_tdco(const struct can_priv *priv) } /* helper to define static CAN controller features at device creation time */ -static inline void can_set_static_ctrlmode(struct net_device *dev, - u32 static_mode) +static inline int __must_check can_set_static_ctrlmode(struct net_device *dev, + u32 static_mode) { struct can_priv *priv = netdev_priv(dev); /* alloc_candev() succeeded => netdev_priv() is valid at this point */ + if (priv->ctrlmode_supported & static_mode) { + netdev_warn(dev, + "Controller features can not be supported and static at the same time\n"); + return -EINVAL; + } priv->ctrlmode = static_mode; /* override MTU which was set by default in can_setup()? 
*/ if (static_mode & CAN_CTRLMODE_FD) dev->mtu = CANFD_MTU; + + return 0; } static inline u32 can_get_static_ctrlmode(struct can_priv *priv) -- cgit v1.2.3 From 5fe1be81efd28bf2afcac218f23118dca6b1b648 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 14 Dec 2021 01:02:25 +0900 Subject: can: dev: reorder struct can_priv members for better packing Save eight bytes of holes on x86-64 architectures by reordering the members of struct can_priv. Before: | $ pahole -C can_priv drivers/net/can/dev/dev.o | struct can_priv { | struct net_device * dev; /* 0 8 */ | struct can_device_stats can_stats; /* 8 24 */ | const struct can_bittiming_const * bittiming_const; /* 32 8 */ | const struct can_bittiming_const * data_bittiming_const; /* 40 8 */ | struct can_bittiming bittiming; /* 48 32 */ | /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */ | struct can_bittiming data_bittiming; /* 80 32 */ | const struct can_tdc_const * tdc_const; /* 112 8 */ | struct can_tdc tdc; /* 120 12 */ | /* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */ | unsigned int bitrate_const_cnt; /* 132 4 */ | const u32 * bitrate_const; /* 136 8 */ | const u32 * data_bitrate_const; /* 144 8 */ | unsigned int data_bitrate_const_cnt; /* 152 4 */ | u32 bitrate_max; /* 156 4 */ | struct can_clock clock; /* 160 4 */ | unsigned int termination_const_cnt; /* 164 4 */ | const u16 * termination_const; /* 168 8 */ | u16 termination; /* 176 2 */ | | /* XXX 6 bytes hole, try to pack */ | | struct gpio_desc * termination_gpio; /* 184 8 */ | /* --- cacheline 3 boundary (192 bytes) --- */ | u16 termination_gpio_ohms[2]; /* 192 4 */ | enum can_state state; /* 196 4 */ | u32 ctrlmode; /* 200 4 */ | u32 ctrlmode_supported; /* 204 4 */ | int restart_ms; /* 208 4 */ | | /* XXX 4 bytes hole, try to pack */ | | struct delayed_work restart_work; /* 216 88 */ | | /* XXX last struct has 4 bytes of padding */ | | /* --- cacheline 4 boundary (256 bytes) was 48 bytes ago --- */ | int 
(*do_set_bittiming)(struct net_device *); /* 304 8 */ | int (*do_set_data_bittiming)(struct net_device *); /* 312 8 */ | /* --- cacheline 5 boundary (320 bytes) --- */ | int (*do_set_mode)(struct net_device *, enum can_mode); /* 320 8 */ | int (*do_set_termination)(struct net_device *, u16); /* 328 8 */ | int (*do_get_state)(const struct net_device *, enum can_state *); /* 336 8 */ | int (*do_get_berr_counter)(const struct net_device *, struct can_berr_counter *); /* 344 8 */ | unsigned int echo_skb_max; /* 352 4 */ | | /* XXX 4 bytes hole, try to pack */ | | struct sk_buff * * echo_skb; /* 360 8 */ | | /* size: 368, cachelines: 6, members: 32 */ | /* sum members: 354, holes: 3, sum holes: 14 */ | /* paddings: 1, sum paddings: 4 */ | /* last cacheline: 48 bytes */ | }; After: | $ pahole -C can_priv drivers/net/can/dev/dev.o | struct can_priv { | struct net_device * dev; /* 0 8 */ | struct can_device_stats can_stats; /* 8 24 */ | const struct can_bittiming_const * bittiming_const; /* 32 8 */ | const struct can_bittiming_const * data_bittiming_const; /* 40 8 */ | struct can_bittiming bittiming; /* 48 32 */ | /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */ | struct can_bittiming data_bittiming; /* 80 32 */ | const struct can_tdc_const * tdc_const; /* 112 8 */ | struct can_tdc tdc; /* 120 12 */ | /* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */ | unsigned int bitrate_const_cnt; /* 132 4 */ | const u32 * bitrate_const; /* 136 8 */ | const u32 * data_bitrate_const; /* 144 8 */ | unsigned int data_bitrate_const_cnt; /* 152 4 */ | u32 bitrate_max; /* 156 4 */ | struct can_clock clock; /* 160 4 */ | unsigned int termination_const_cnt; /* 164 4 */ | const u16 * termination_const; /* 168 8 */ | u16 termination; /* 176 2 */ | | /* XXX 6 bytes hole, try to pack */ | | struct gpio_desc * termination_gpio; /* 184 8 */ | /* --- cacheline 3 boundary (192 bytes) --- */ | u16 termination_gpio_ohms[2]; /* 192 4 */ | unsigned int echo_skb_max; /* 196 4 */ | 
struct sk_buff * * echo_skb; /* 200 8 */ | enum can_state state; /* 208 4 */ | u32 ctrlmode; /* 212 4 */ | u32 ctrlmode_supported; /* 216 4 */ | int restart_ms; /* 220 4 */ | struct delayed_work restart_work; /* 224 88 */ | | /* XXX last struct has 4 bytes of padding */ | | /* --- cacheline 4 boundary (256 bytes) was 56 bytes ago --- */ | int (*do_set_bittiming)(struct net_device *); /* 312 8 */ | /* --- cacheline 5 boundary (320 bytes) --- */ | int (*do_set_data_bittiming)(struct net_device *); /* 320 8 */ | int (*do_set_mode)(struct net_device *, enum can_mode); /* 328 8 */ | int (*do_set_termination)(struct net_device *, u16); /* 336 8 */ | int (*do_get_state)(const struct net_device *, enum can_state *); /* 344 8 */ | int (*do_get_berr_counter)(const struct net_device *, struct can_berr_counter *); /* 352 8 */ | | /* size: 360, cachelines: 6, members: 32 */ | /* sum members: 354, holes: 1, sum holes: 6 */ | /* paddings: 1, sum paddings: 4 */ | /* last cacheline: 40 bytes */ | }; Link: https://lore.kernel.org/all/20211213160226.56219-4-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index fff3f70df697..c2ea47f30046 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -64,6 +64,9 @@ struct can_priv { struct gpio_desc *termination_gpio; u16 termination_gpio_ohms[CAN_TERMINATION_GPIO_MAX]; + unsigned int echo_skb_max; + struct sk_buff **echo_skb; + enum can_state state; /* CAN controller features - see include/uapi/linux/can/netlink.h */ @@ -83,9 +86,6 @@ struct can_priv { struct can_berr_counter *bec); int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); - unsigned int echo_skb_max; - struct sk_buff **echo_skb; - #ifdef CONFIG_CAN_LEDS struct led_trigger *tx_led_trig; char tx_led_trig_name[CAN_LED_NAME_SZ]; -- cgit 
v1.2.3 From c6af53f038aa32cec12e8a305ba07c7ef168f1b0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 4 Jan 2022 12:07:00 +0000 Subject: net: mdio: add helpers to extract clause 45 regad and devad fields Add a couple of helpers and definitions to extract the clause 45 regad and devad fields from the regnum passed into MDIO drivers. Tested-by: Daniel Golle Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Signed-off-by: Daniel Golle Signed-off-by: David S. Miller --- include/linux/mdio.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 9f3587a61e14..ecac96d52e01 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -7,6 +7,7 @@ #define __LINUX_MDIO_H__ #include +#include #include /* Or MII_ADDR_C45 into regnum for read/write on mii_bus to enable the 21 bit @@ -14,6 +15,7 @@ */ #define MII_ADDR_C45 (1<<30) #define MII_DEVADDR_C45_SHIFT 16 +#define MII_DEVADDR_C45_MASK GENMASK(20, 16) #define MII_REGADDR_C45_MASK GENMASK(15, 0) struct gpio_desc; @@ -381,6 +383,16 @@ static inline u32 mdiobus_c45_addr(int devad, u16 regnum) return MII_ADDR_C45 | devad << MII_DEVADDR_C45_SHIFT | regnum; } +static inline u16 mdiobus_c45_regad(u32 regnum) +{ + return FIELD_GET(MII_REGADDR_C45_MASK, regnum); +} + +static inline u16 mdiobus_c45_devad(u32 regnum) +{ + return FIELD_GET(MII_DEVADDR_C45_MASK, regnum); +} + static inline int __mdiobus_c45_read(struct mii_bus *bus, int prtad, int devad, u16 regnum) { -- cgit v1.2.3 From 01ec4a2e8f01f027a0f06cad237c935da8d643bf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Jul 2021 00:23:39 +0200 Subject: headers/deps: USB: Optimize dependencies, remove The header is used over 1,400 times in a typical distro build, but few of its users actually need the full header. 
-------------------------------------------------------------------- | Combined, preprocessed C code size of header, without line markers, | with comments stripped: ------------------------- before: | #include | LOC: 7,078 | headers: 172 after: | #include | LOC: 812 | headers: 38 Remove it and add it to the places that need it. Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/ch9.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h index 1cffa34740b0..969e7dba6358 100644 --- a/include/linux/usb/ch9.h +++ b/include/linux/usb/ch9.h @@ -33,7 +33,6 @@ #ifndef __LINUX_USB_CH9_H #define __LINUX_USB_CH9_H -#include #include /* USB 3.2 SuperSpeed Plus phy signaling rate generation and lane count */ @@ -45,6 +44,8 @@ enum usb_ssp_rate { USB_SSP_GEN_2x2, }; +struct device; + extern const char *usb_ep_type_string(int ep_type); extern const char *usb_speed_string(enum usb_device_speed speed); extern enum usb_device_speed usb_get_maximum_speed(struct device *dev); -- cgit v1.2.3 From edce22e19bfa86efa2522d041d6367f2f099e8ed Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 5 Jan 2022 09:05:15 -0800 Subject: block: move rq_list macros to blk-mq.h Move the request list macros to the header file that defines that struct they operate on. 
Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220105170518.3181469-2-kbusch@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 29 +++++++++++++++++++++++++++++ include/linux/blkdev.h | 29 ----------------------------- 2 files changed, 29 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 550996cf419c..bf64b94cd64e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -216,6 +216,35 @@ static inline unsigned short req_get_ioprio(struct request *req) #define rq_dma_dir(rq) \ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) +#define rq_list_add(listptr, rq) do { \ + (rq)->rq_next = *(listptr); \ + *(listptr) = rq; \ +} while (0) + +#define rq_list_pop(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) { \ + __req = *(listptr); \ + *(listptr) = __req->rq_next; \ + } \ + __req; \ +}) + +#define rq_list_peek(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) \ + __req = *(listptr); \ + __req; \ +}) + +#define rq_list_for_each(listptr, pos) \ + for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) + +#define rq_list_next(rq) (rq)->rq_next +#define rq_list_empty(list) ((list) == (struct request *) NULL) + enum blk_eh_timer_return { BLK_EH_DONE, /* drivers has completed the command */ BLK_EH_RESET_TIMER, /* reset timer and try again */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 22746b2d6825..9c95df26fc26 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1339,33 +1339,4 @@ struct io_comp_batch { #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } -#define rq_list_add(listptr, rq) do { \ - (rq)->rq_next = *(listptr); \ - *(listptr) = rq; \ -} while (0) - -#define rq_list_pop(listptr) \ -({ \ - struct request *__req = NULL; \ - if ((listptr) && *(listptr)) { \ - __req = *(listptr); \ - 
*(listptr) = __req->rq_next; \ - } \ - __req; \ -}) - -#define rq_list_peek(listptr) \ -({ \ - struct request *__req = NULL; \ - if ((listptr) && *(listptr)) \ - __req = *(listptr); \ - __req; \ -}) - -#define rq_list_for_each(listptr, pos) \ - for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) - -#define rq_list_next(rq) (rq)->rq_next -#define rq_list_empty(list) ((list) == (struct request *) NULL) - #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 3764fd05e1f89530e2ee5cbff0b638f2b1141b90 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 5 Jan 2022 09:05:16 -0800 Subject: block: introduce rq_list_for_each_safe macro While iterating a list, a particular request may need to be removed for special handling. Provide an iterator that can safely handle that. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220105170518.3181469-3-kbusch@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index bf64b94cd64e..1467f0fa2142 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -242,6 +242,10 @@ static inline unsigned short req_get_ioprio(struct request *req) #define rq_list_for_each(listptr, pos) \ for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) +#define rq_list_for_each_safe(listptr, pos, nxt) \ + for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos); \ + pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL) + #define rq_list_next(rq) (rq)->rq_next #define rq_list_empty(list) ((list) == (struct request *) NULL) -- cgit v1.2.3 From d2528be7a8b09af9796a270debd14101a72bb552 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 5 Jan 2022 09:05:17 -0800 Subject: block: introduce rq_list_move When iterating a list, a particular request may need to be moved for special handling. 
Provide a helper function to achieve that so drivers don't need to reimplement rqlist manipulation. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220105170518.3181469-4-kbusch@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1467f0fa2142..f40a05ecca4a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -249,6 +249,23 @@ static inline unsigned short req_get_ioprio(struct request *req) #define rq_list_next(rq) (rq)->rq_next #define rq_list_empty(list) ((list) == (struct request *) NULL) +/** + * rq_list_move() - move a struct request from one list to another + * @src: The source list @rq is currently in + * @dst: The destination list that @rq will be appended to + * @rq: The request to move + * @prev: The request preceding @rq in @src (NULL if @rq is the head) + */ +static void inline rq_list_move(struct request **src, struct request **dst, + struct request *rq, struct request *prev) +{ + if (prev) + prev->rq_next = rq->rq_next; + else + *src = rq->rq_next; + rq_list_add(dst, rq); +} + enum blk_eh_timer_return { BLK_EH_DONE, /* drivers has completed the command */ BLK_EH_RESET_TIMER, /* reset timer and try again */ -- cgit v1.2.3 From d53ad5d8b218a885e95080d4d3d556b16b91b1b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:09 +0100 Subject: xdp: Move conversion to xdp_frame out of map functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All map redirect functions except XSK maps convert xdp_buff to xdp_frame before enqueueing it. So move this conversion out of the map functions and into xdp_do_redirect().
This removes a bit of duplicated code, but more importantly it makes it possible to support caller-allocated xdp_frame structures, which will be added in a subsequent commit. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103150812.87914-5-toke@redhat.com --- include/linux/bpf.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 26753139d5b4..6e947cd91152 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1669,17 +1669,17 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ -struct xdp_buff; +struct xdp_frame; struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; void __dev_flush(void); -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx); -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); @@ -1688,7 +1688,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, bool exclude_ingress); void __cpu_map_flush(void); -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx); int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb); @@ -1866,26 +1866,26 @@ static inline void __dev_flush(void) { } -struct 
xdp_buff; +struct xdp_frame; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; static inline -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { return 0; @@ -1913,7 +1913,7 @@ static inline void __cpu_map_flush(void) } static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, - struct xdp_buff *xdp, + struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; -- cgit v1.2.3 From 1372d34ccf6dd480332b2bcb2fd59a2b9a0df415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:10 +0100 Subject: xdp: Add xdp_do_redirect_frame() for pre-computed xdp_frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an xdp_do_redirect_frame() variant which supports pre-computed xdp_frame structures. This will be used in bpf_prog_run() to avoid having to write to the xdp_frame structure when the XDP program doesn't modify the frame boundaries. 
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103150812.87914-6-toke@redhat.com --- include/linux/filter.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 60eec80fa1d4..71fa57b88bfc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1019,6 +1019,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); +int xdp_do_redirect_frame(struct net_device *dev, + struct xdp_buff *xdp, + struct xdp_frame *xdpf, + struct bpf_prog *prog); void xdp_do_flush(void); /* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as -- cgit v1.2.3 From ae16d059f8c9409eba0c412639def0494765b761 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 26 Oct 2021 18:22:44 +0200 Subject: mm/slub: Make object_err() static There are no callers outside of mm/slub.c anymore. Move freelist_corrupted() that calls object_err() to avoid a need for forward declaration. 
Signed-off-by: Vlastimil Babka Reviewed-by: Roman Gushchin --- include/linux/slub_def.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 0fa751b946fa..1ef68d4de9c0 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -156,9 +156,6 @@ static inline void sysfs_slab_release(struct kmem_cache *s) } #endif -void object_err(struct kmem_cache *s, struct page *page, - u8 *object, char *reason); - void *fixup_red_left(struct kmem_cache *s, void *p); static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, -- cgit v1.2.3 From d122019bf061cccc4583eb9ad40bf58c2fe517be Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 4 Oct 2021 14:45:51 +0100 Subject: mm: Split slab into its own type Make struct slab independent of struct page. It still uses the underlying memory in struct page for storing slab-specific data, but slab and slub can now be weaned off using struct page directly. Some of the wrapper functions (slab_address() and slab_order()) still need to cast to struct folio, but this is a significant disentanglement. [ vbabka@suse.cz: Rebase on folios, use folio instead of page where possible. Do not duplicate flags field in struct slab, instead make the related accessors go through slab_folio(). For testing pfmemalloc use the folio_*_active flag accessors directly so the PageSlabPfmemalloc wrappers can be removed later. Make folio_slab() expect only folio_test_slab() == true folios and virt_to_slab() return NULL when folio_test_slab() == false. Move struct slab to mm/slab.h. Don't represent with struct slab pages that are not true slab pages, but just a compound page obtained directly from page allocator (with large kmalloc() for SLUB and SLOB). 
] Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Vlastimil Babka Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin --- include/linux/mm_types.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3a6e6209600..1ae3537c7920 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -56,11 +56,11 @@ struct mem_cgroup; * in each subpage, but you may need to restore some of their values * afterwards. * - * SLUB uses cmpxchg_double() to atomically update its freelist and - * counters. That requires that freelist & counters be adjacent and - * double-word aligned. We align all struct pages to double-word - * boundaries, and ensure that 'freelist' is aligned within the - * struct. + * SLUB uses cmpxchg_double() to atomically update its freelist and counters. + * That requires that freelist & counters in struct slab be adjacent and + * double-word aligned. Because struct slab currently just reinterprets the + * bits of struct page, we align all struct pages to double-word boundaries, + * and ensure that 'freelist' is aligned within struct slab. */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) -- cgit v1.2.3 From 0b3eb091d5759479d44cb793fad2c51ea06bdcec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 4 Oct 2021 14:45:56 +0100 Subject: mm: Convert check_heap_object() to use struct slab Ensure that we're not seeing a tail page inside __check_heap_object() by converting to a slab instead of a page. Take the opportunity to mark the slab as const since we're not modifying it. Also move the declaration of __check_heap_object() to mm/slab.h so it's not available to the wider kernel. 
[ vbabka@suse.cz: in check_heap_object() only convert to struct slab for actual PageSlab pages; use folio as intermediate step instead of page ] Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Vlastimil Babka Reviewed-by: Roman Gushchin --- include/linux/slab.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 181045148b06..367366f1d1ff 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -189,14 +189,6 @@ bool kmem_valid_obj(void *object); void kmem_dump_obj(void *object); #endif -#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR -void __check_heap_object(const void *ptr, unsigned long n, struct page *page, - bool to_user); -#else -static inline void __check_heap_object(const void *ptr, unsigned long n, - struct page *page, bool to_user) { } -#endif - /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. -- cgit v1.2.3 From bb192ed9aa7191a5d65548f82c42b6750d65f569 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Nov 2021 15:39:59 +0100 Subject: mm/slub: Convert most struct page to struct slab by spatch The majority of conversion from struct page to struct slab in SLUB internals can be delegated to a coccinelle semantic patch. This includes renaming of variables with 'page' in name to 'slab', and similar. Big thanks to Julia Lawall and Luis Chamberlain for help with coccinelle. 
// Options: --include-headers --no-includes --smpl-spacing include/linux/slub_def.h mm/slub.c // Note: needs coccinelle 1.1.1 to avoid breaking whitespace, and ocaml for the // embedded script // build list of functions to exclude from applying the next rule @initialize:ocaml@ @@ let ok_function p = not (List.mem (List.hd p).current_element ["nearest_obj";"obj_to_index";"objs_per_slab_page";"__slab_lock";"__slab_unlock";"free_nonslab_page";"kmalloc_large_node"]) // convert the type from struct page to struct slab in all functions except the // list from previous rule // this also affects struct kmem_cache_cpu, but that's ok @@ position p : script:ocaml() { ok_function p }; @@ - struct page@p + struct slab // in struct kmem_cache_cpu, change the name from page to slab // the type was already converted by the previous rule @@ @@ struct kmem_cache_cpu { ... -struct slab *page; +struct slab *slab; ... } // there are many places that use c->page which is now c->slab after the // previous rule @@ struct kmem_cache_cpu *c; @@ -c->page +c->slab @@ @@ struct kmem_cache { ... - unsigned int cpu_partial_pages; + unsigned int cpu_partial_slabs; ... } @@ struct kmem_cache *s; @@ - s->cpu_partial_pages + s->cpu_partial_slabs @@ @@ static void - setup_page_debug( + setup_slab_debug( ...) {...} @@ @@ - setup_page_debug( + setup_slab_debug( ...); // for all functions (with exceptions), change any "struct slab *page" // parameter to "struct slab *slab" in the signature, and generally all // occurrences of "page" to "slab" in the body - with some special cases. @@ identifier fn !~ "free_nonslab_page|obj_to_index|objs_per_slab_page|nearest_obj"; @@ fn(..., - struct slab *page + struct slab *slab ,...) { <... - page + slab ...> } // similar to previous but the param is called partial_page @@ identifier fn; @@ fn(..., - struct slab *partial_page + struct slab *partial_slab ,...) { <... 
- partial_page + partial_slab ...> } // similar to previous but for functions that take pointer to struct page ptr @@ identifier fn; @@ fn(..., - struct slab **ret_page + struct slab **ret_slab ,...) { <... - ret_page + ret_slab ...> } // functions converted by previous rules that were temporarily called using // slab_page(E) so we want to remove the wrapper now that they accept struct // slab ptr directly @@ identifier fn =~ "slab_free|do_slab_free"; expression E; @@ fn(..., - slab_page(E) + E ,...) // similar to previous but for another pattern @@ identifier fn =~ "slab_pad_check|check_object"; @@ fn(..., - folio_page(folio, 0) + slab ,...) // functions that were returning struct page ptr and now will return struct // slab ptr, including slab_page() wrapper removal @@ identifier fn =~ "allocate_slab|new_slab"; expression E; @@ static -struct slab * +struct slab * fn(...) { <... - slab_page(E) + E ...> } // rename any former struct page * declarations @@ @@ struct slab * ( - page + slab | - partial_page + partial_slab | - oldpage + oldslab ) ; // this has to be separate from previous rule as page and page2 appear at the // same line @@ @@ struct slab * -page2 +slab2 ; // similar but with initial assignment @@ expression E; @@ struct slab * ( - page + slab | - flush_page + flush_slab | - discard_page + slab_to_discard | - page_to_unfreeze + slab_to_unfreeze ) = E; // convert most of struct page to struct slab usage inside functions (with // exceptions), including specific variable renames @@ identifier fn !~ "nearest_obj|obj_to_index|objs_per_slab_page|__slab_(un)*lock|__free_slab|free_nonslab_page|kmalloc_large_node"; expression E; @@ fn(...) { <... 
( - int pages; + int slabs; | - int pages = E; + int slabs = E; | - page + slab | - flush_page + flush_slab | - partial_page + partial_slab | - oldpage->pages + oldslab->slabs | - oldpage + oldslab | - unsigned int nr_pages; + unsigned int nr_slabs; | - nr_pages + nr_slabs | - unsigned int partial_pages = E; + unsigned int partial_slabs = E; | - partial_pages + partial_slabs ) ...> } // this has to be split out from the previous rule so that lines containing // multiple matching changes will be fully converted @@ identifier fn !~ "nearest_obj|obj_to_index|objs_per_slab_page|__slab_(un)*lock|__free_slab|free_nonslab_page|kmalloc_large_node"; @@ fn(...) { <... ( - slab->pages + slab->slabs | - pages + slabs | - page2 + slab2 | - discard_page + slab_to_discard | - page_to_unfreeze + slab_to_unfreeze ) ...> } // after we simply changed all occurrences of page to slab, some usages need // adjustment for slab-specific functions, or use slab_page() wrapper @@ identifier fn !~ "nearest_obj|obj_to_index|objs_per_slab_page|__slab_(un)*lock|__free_slab|free_nonslab_page|kmalloc_large_node"; @@ fn(...) { <... 
( - page_slab(slab) + slab | - kasan_poison_slab(slab) + kasan_poison_slab(slab_page(slab)) | - page_address(slab) + slab_address(slab) | - page_size(slab) + slab_size(slab) | - PageSlab(slab) + folio_test_slab(slab_folio(slab)) | - page_to_nid(slab) + slab_nid(slab) | - compound_order(slab) + slab_order(slab) ) ...> } Signed-off-by: Vlastimil Babka Reviewed-by: Roman Gushchin Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Julia Lawall Cc: Luis Chamberlain --- include/linux/slub_def.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 1ef68d4de9c0..00d99afe1c0e 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -48,9 +48,9 @@ enum stat_item { struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ - struct page *page; /* The slab from which we are allocating */ + struct slab *slab; /* The slab from which we are allocating */ #ifdef CONFIG_SLUB_CPU_PARTIAL - struct page *partial; /* Partially allocated frozen slabs */ + struct slab *partial; /* Partially allocated frozen slabs */ #endif local_lock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS @@ -100,7 +100,7 @@ struct kmem_cache { /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; /* Number of per cpu partial pages to keep around */ - unsigned int cpu_partial_pages; + unsigned int cpu_partial_slabs; #endif struct kmem_cache_order_objects oo; -- cgit v1.2.3 From c2092c12064a9728b2928979f88575cc1c2247fa Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 15 Nov 2021 16:55:15 +0100 Subject: mm/slub: Finish struct page to struct slab conversion Update comments mentioning pages to mention slabs where appropriate. Also some goto labels. 
Signed-off-by: Vlastimil Babka Reviewed-by: Roman Gushchin --- include/linux/slub_def.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 00d99afe1c0e..8a9c2876ca89 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -99,7 +99,7 @@ struct kmem_cache { #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; - /* Number of per cpu partial pages to keep around */ + /* Number of per cpu partial slabs to keep around */ unsigned int cpu_partial_slabs; #endif struct kmem_cache_order_objects oo; -- cgit v1.2.3 From 40f3bf0cb04c91d33531b1b95788ad2f0e4062cf Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 2 Nov 2021 15:42:04 +0100 Subject: mm: Convert struct page to struct slab in functions used by other subsystems KASAN, KFENCE and memcg interact with SLAB or SLUB internals through functions nearest_obj(), obj_to_index() and objs_per_slab() that use struct page as parameter. This patch converts it to struct slab including all callers, through a coccinelle semantic patch. // Options: --include-headers --no-includes --smpl-spacing include/linux/slab_def.h include/linux/slub_def.h mm/slab.h mm/kasan/*.c mm/kfence/kfence_test.c mm/memcontrol.c mm/slab.c mm/slub.c // Note: needs coccinelle 1.1.1 to avoid breaking whitespace @@ @@ -objs_per_slab_page( +objs_per_slab( ... ) { ... } @@ @@ -objs_per_slab_page( +objs_per_slab( ... ) @@ identifier fn =~ "obj_to_index|objs_per_slab"; @@ fn(..., - const struct page *page + const struct slab *slab ,...) { <... ( - page_address(page) + slab_address(slab) | - page + slab ) ...> } @@ identifier fn =~ "nearest_obj"; @@ fn(..., - struct page *page + const struct slab *slab ,...) { <... 
( - page_address(page) + slab_address(slab) | - page + slab ) ...> } @@ identifier fn =~ "nearest_obj|obj_to_index|objs_per_slab"; expression E; @@ fn(..., ( - slab_page(E) + E | - virt_to_page(E) + virt_to_slab(E) | - virt_to_head_page(E) + virt_to_slab(E) | - page + page_slab(page) ) ,...) Signed-off-by: Vlastimil Babka Reviewed-by: Andrey Konovalov Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Julia Lawall Cc: Luis Chamberlain Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Marco Elver Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Cc: --- include/linux/slab_def.h | 16 ++++++++-------- include/linux/slub_def.h | 18 +++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 3aa5e1e73ab6..e24c9aff6fed 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -87,11 +87,11 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; -static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, +static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, void *x) { - void *object = x - (x - page->s_mem) % cache->size; - void *last_object = page->s_mem + (cache->num - 1) * cache->size; + void *object = x - (x - slab->s_mem) % cache->size; + void *last_object = slab->s_mem + (cache->num - 1) * cache->size; if (unlikely(object > last_object)) return last_object; @@ -106,16 +106,16 @@ static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, * reciprocal_divide(offset, cache->reciprocal_buffer_size) */ static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct page *page, void *obj) + const struct slab *slab, void *obj) { - u32 offset = (obj - page->s_mem); + u32 offset = (obj - slab->s_mem); return reciprocal_divide(offset, cache->reciprocal_buffer_size); } -static inline int 
objs_per_slab_page(const struct kmem_cache *cache, - const struct page *page) +static inline int objs_per_slab(const struct kmem_cache *cache, + const struct slab *slab) { - if (is_kfence_address(page_address(page))) + if (is_kfence_address(slab_address(slab))) return 1; return cache->num; } diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 8a9c2876ca89..33c5c0e3bd8d 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -158,11 +158,11 @@ static inline void sysfs_slab_release(struct kmem_cache *s) void *fixup_red_left(struct kmem_cache *s, void *p); -static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, +static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, void *x) { - void *object = x - (x - page_address(page)) % cache->size; - void *last_object = page_address(page) + - (page->objects - 1) * cache->size; + void *object = x - (x - slab_address(slab)) % cache->size; + void *last_object = slab_address(slab) + + (slab->objects - 1) * cache->size; void *result = (unlikely(object > last_object)) ? 
last_object : object; result = fixup_red_left(cache, result); @@ -178,16 +178,16 @@ static inline unsigned int __obj_to_index(const struct kmem_cache *cache, } static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct page *page, void *obj) + const struct slab *slab, void *obj) { if (is_kfence_address(obj)) return 0; - return __obj_to_index(cache, page_address(page), obj); + return __obj_to_index(cache, slab_address(slab), obj); } -static inline int objs_per_slab_page(const struct kmem_cache *cache, - const struct page *page) +static inline int objs_per_slab(const struct kmem_cache *cache, + const struct slab *slab) { - return page->objects; + return slab->objects; } #endif /* _LINUX_SLUB_DEF_H */ -- cgit v1.2.3 From 4b5f8d9a895ada8e0abb58ccd35d9fe229e3a595 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 2 Nov 2021 22:42:04 +0100 Subject: mm/memcg: Convert slab objcgs from struct page to struct slab page->memcg_data is used with MEMCG_DATA_OBJCGS flag only for slab pages so convert all the related infrastructure to struct slab. Also use struct folio instead of struct page when resolving object pointers. This is not just mechanistic changing of types and names. Now in mem_cgroup_from_obj() we use folio_test_slab() to decide if we interpret the folio as a real slab instead of a large kmalloc, instead of relying on MEMCG_DATA_OBJCGS bit that used to be checked in page_objcgs_check(). Similarly in memcg_slab_free_hook() where we can encounter kmalloc_large() pages (here the folio slab flag check is implied by virt_to_slab()). As a result, page_objcgs_check() can be dropped instead of converted. To avoid include cycles, move the inline definition of slab_objcgs() from memcontrol.h to mm/slab.h. 
Signed-off-by: Vlastimil Babka Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: --- include/linux/memcontrol.h | 48 ---------------------------------------------- 1 file changed, 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0c5c403f4be6..e34112f6a369 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -536,45 +536,6 @@ static inline bool folio_memcg_kmem(struct folio *folio) return folio->memcg_data & MEMCG_DATA_KMEM; } -/* - * page_objcgs - get the object cgroups vector associated with a page - * @page: a pointer to the page struct - * - * Returns a pointer to the object cgroups vector associated with the page, - * or NULL. This function assumes that the page is known to have an - * associated object cgroups vector. It's not safe to call this function - * against pages, which might have an associated memory cgroup: e.g. - * kernel stack pages. - */ -static inline struct obj_cgroup **page_objcgs(struct page *page) -{ - unsigned long memcg_data = READ_ONCE(page->memcg_data); - - VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); - - return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -} - -/* - * page_objcgs_check - get the object cgroups vector associated with a page - * @page: a pointer to the page struct - * - * Returns a pointer to the object cgroups vector associated with the page, - * or NULL. This function is safe to use if the page can be directly associated - * with a memory cgroup. 
- */ -static inline struct obj_cgroup **page_objcgs_check(struct page *page) -{ - unsigned long memcg_data = READ_ONCE(page->memcg_data); - - if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) - return NULL; - - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); - - return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -} #else static inline bool folio_memcg_kmem(struct folio *folio) @@ -582,15 +543,6 @@ static inline bool folio_memcg_kmem(struct folio *folio) return false; } -static inline struct obj_cgroup **page_objcgs(struct page *page) -{ - return NULL; -} - -static inline struct obj_cgroup **page_objcgs_check(struct page *page) -{ - return NULL; -} #endif static inline bool PageMemcgKmem(struct page *page) -- cgit v1.2.3 From 6e48a966dfd18987fec9385566a67d36e2b5fc11 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 4 Oct 2021 14:46:46 +0100 Subject: mm/kasan: Convert to struct folio and struct slab KASAN accesses some slab related struct page fields so we need to convert it to struct slab. Some places are a bit simplified thanks to kasan_addr_to_slab() encapsulating the PageSlab flag check through virt_to_slab(). When resolving object address to either a real slab or a large kmalloc, use struct folio as the intermediate type for testing the slab flag to avoid unnecessary implicit compound_head(). 
[ vbabka@suse.cz: use struct folio, adjust to differences in previous patches ] Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Vlastimil Babka Reviewed-by: Andrey Konovalov Reviewed-by: Roman Gushchin Tested-by: Hyeongogn Yoo <42.hyeyoo@gmail.com> Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: --- include/linux/kasan.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d8783b682669..fb78108d694e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -9,6 +9,7 @@ struct kmem_cache; struct page; +struct slab; struct vm_struct; struct task_struct; @@ -193,11 +194,11 @@ static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache) return 0; } -void __kasan_poison_slab(struct page *page); -static __always_inline void kasan_poison_slab(struct page *page) +void __kasan_poison_slab(struct slab *slab); +static __always_inline void kasan_poison_slab(struct slab *slab) { if (kasan_enabled()) - __kasan_poison_slab(page); + __kasan_poison_slab(slab); } void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object); @@ -322,7 +323,7 @@ static inline void kasan_cache_create(struct kmem_cache *cache, slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } -static inline void kasan_poison_slab(struct page *page) {} +static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} static inline void kasan_poison_object_data(struct kmem_cache *cache, -- cgit v1.2.3 From c5e97ed154589524a1df4ae2be55c4cfdb0d0573 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 4 Oct 2021 14:46:48 +0100 Subject: bootmem: Use page->index instead of page->freelist page->freelist is for the use of 
slab. Using page->index is the same set of bits as page->freelist, and by using an integer instead of a pointer, we can avoid casts. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Vlastimil Babka Acked-by: Johannes Weiner Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Cc: "H. Peter Anvin" --- include/linux/bootmem_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index 2bc8b1f69c93..cc35d010fa94 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -30,7 +30,7 @@ void put_page_bootmem(struct page *page); */ static inline void free_bootmem_page(struct page *page) { - unsigned long magic = (unsigned long)page->freelist; + unsigned long magic = page->index; /* * The reserve_bootmem_region sets the reserved flag on bootmem -- cgit v1.2.3 From 007747a984ea5e895b7d8b056b24ebf431e1e71d Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Wed, 5 Jan 2022 11:33:26 +0100 Subject: net: fix SOF_TIMESTAMPING_BIND_PHC to work with multiple sockets When multiple sockets using the SOF_TIMESTAMPING_BIND_PHC flag received a packet with a hardware timestamp (e.g. multiple PTP instances in different PTP domains using the UDPv4/v6 multicast or L2 transport), the timestamps received on some sockets were corrupted due to repeated conversion of the same timestamp (by the same or different vclocks). Fix ptp_convert_timestamp() to not modify the shared skb timestamp and return the converted timestamp as a ktime_t instead. If the conversion fails, return 0 to not confuse the application with timestamps corresponding to an unexpected PHC. Fixes: d7c088265588 ("net: socket: support hardware timestamp conversion to PHC bound") Signed-off-by: Miroslav Lichvar Cc: Yangbo Lu Cc: Richard Cochran Acked-by: Richard Cochran Signed-off-by: David S. 
Miller --- include/linux/ptp_clock_kernel.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 2e5565067355..554454cb8693 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -351,15 +351,17 @@ int ptp_get_vclocks_index(int pclock_index, int **vclock_index); * * @hwtstamps: skb_shared_hwtstamps structure pointer * @vclock_index: phc index of ptp vclock. + * + * Returns converted timestamp, or 0 on error. */ -void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps, - int vclock_index); +ktime_t ptp_convert_timestamp(const struct skb_shared_hwtstamps *hwtstamps, + int vclock_index); #else static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { return 0; } -static inline void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps, - int vclock_index) -{ } +static inline ktime_t ptp_convert_timestamp(const struct skb_shared_hwtstamps *hwtstamps, + int vclock_index) +{ return 0; } #endif -- cgit v1.2.3 From eac1b93c14d645ef147b049ace0d5230df755548 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Wed, 5 Jan 2022 02:48:38 -0800 Subject: gro: add ability to control gro max packet size Eric Dumazet suggested to allow users to modify max GRO packet size. We have seen GRO being disabled by users of appliances (such as wifi access points) because of claimed bufferbloat issues, or some workarounds in sch_cake, to split GRO/GSO packets. Instead of disabling GRO completely, one can choose to limit the maximum packet size of GRO packets, depending on their latency constraints. This patch adds a per device gro_max_size attribute that can be changed with ip link command. ip link set dev eth0 gro_max_size 16000 Suggested-by: Eric Dumazet Signed-off-by: Coco Li Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6f99c8f51b60..3213c7227b59 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1942,6 +1942,8 @@ enum netdev_ml_priv_type { * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. + * @gro_max_size: Maximum size of aggregated packet in generic + * receive offload (GRO) * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. @@ -2131,6 +2133,8 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; int napi_defer_hard_irqs; +#define GRO_MAX_SIZE 65536 + unsigned int gro_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; @@ -4806,6 +4810,13 @@ static inline void netif_set_gso_max_segs(struct net_device *dev, WRITE_ONCE(dev->gso_max_segs, segs); } +static inline void netif_set_gro_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_max_size, size); +} + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) -- cgit v1.2.3 From 3809fe479861194e310c23ed48b010c7c0f72d22 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 16 Dec 2021 10:21:57 +0100 Subject: HID: address kernel-doc warnings The command ./scripts/kernel-doc -none include/linux/hid.h reports: include/linux/hid.h:818: warning: cannot understand function prototype: 'struct hid_ll_driver ' include/linux/hid.h:1135: warning: expecting prototype for hid_may_wakeup(). Prototype was for hid_hw_may_wakeup() instead Address those kernel-doc warnings. 
Signed-off-by: Lukas Bulwahn Signed-off-by: Jiri Kosina --- include/linux/hid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index f453be385bd4..aaf89a8a836c 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -788,7 +788,7 @@ struct hid_driver { container_of(pdrv, struct hid_driver, driver) /** - * hid_ll_driver - low level driver callbacks + * struct hid_ll_driver - low level driver callbacks * @start: called on probe to start the device * @stop: called on remove * @open: called by input layer on open @@ -1158,7 +1158,7 @@ static inline int hid_hw_idle(struct hid_device *hdev, int report, int idle, } /** - * hid_may_wakeup - return if the hid device may act as a wakeup source during system-suspend + * hid_hw_may_wakeup - return if the hid device may act as a wakeup source during system-suspend * * @hdev: hid device */ -- cgit v1.2.3 From 36dacddbf0bdba86cd00f066b4d724157eeb63f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dirk=20M=C3=BCller?= Date: Wed, 5 Jan 2022 17:38:47 +0100 Subject: lib/raid6: Use strict priority ranking for pq gen() benchmarking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On x86_64, currently 3 variants of AVX512, 3 variants of AVX2 and 3 variants of SSE2 are benchmarked on initialization, taking between 144-153 jiffies. Testing across a hardware pool of various generations of Intel CPUs I could not find a single case where SSE2 won over AVX2 or AVX512. There are cases where AVX2 wins over AVX512 however. Change "prefer" into an integer priority field (similar to how recov selection works) to have more than one ranking level available, which is backwards compatible with existing behavior. Give AVX2/512 variants higher priority over SSE2 in order to skip SSE testing when AVX is available. In an AVX2/x86_64/HZ=250 case this saves in the order of 200ms of initialization time. 
Signed-off-by: Dirk Müller Acked-by: Paul Menzel Signed-off-by: Song Liu --- include/linux/raid/pq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 154e954b711d..d6e5a1feb947 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -81,7 +81,7 @@ struct raid6_calls { void (*xor_syndrome)(int, int, int, size_t, void **); int (*valid)(void); /* Returns 1 if this routine set is usable */ const char *name; /* Name of this routine set */ - int prefer; /* Has special performance attribute */ + int priority; /* Relative priority ranking if non-zero */ }; /* Selected algorithm */ -- cgit v1.2.3 From 07f910f9b7295b6a28b337fedb56e612684c5659 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 4 Oct 2021 14:46:50 +0100 Subject: mm: Remove slab from struct page All members of struct slab can now be removed from struct page. This shrinks the definition of struct page by 30 LOC, making it easier to understand. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Vlastimil Babka --- include/linux/mm_types.h | 28 ---------------------------- include/linux/page-flags.h | 37 ------------------------------------- 2 files changed, 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1ae3537c7920..646f3ed4f6df 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -118,31 +118,6 @@ struct page { atomic_long_t pp_frag_count; }; }; - struct { /* slab, slob and slub */ - union { - struct list_head slab_list; - struct { /* Partial pages */ - struct page *next; -#ifdef CONFIG_64BIT - int pages; /* Nr of pages left */ -#else - short int pages; -#endif - }; - }; - struct kmem_cache *slab_cache; /* not slob */ - /* Double-word boundary */ - void *freelist; /* first free object */ - union { - void *s_mem; /* slab: first object */ - unsigned long counters; /* SLUB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; - }; - }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ @@ -206,9 +181,6 @@ struct page { * which are currently stored here. */ unsigned int page_type; - - unsigned int active; /* SLAB */ - int units; /* SLOB */ }; /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b5f14d581113..1b08e33265fa 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -909,43 +909,6 @@ extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY); -/* - * If network-based swap is enabled, sl*b must keep track of whether pages - * were allocated from pfmemalloc reserves. 
- */ -static inline int PageSlabPfmemalloc(struct page *page) -{ - VM_BUG_ON_PAGE(!PageSlab(page), page); - return PageActive(page); -} - -/* - * A version of PageSlabPfmemalloc() for opportunistic checks where the page - * might have been freed under us and not be a PageSlab anymore. - */ -static inline int __PageSlabPfmemalloc(struct page *page) -{ - return PageActive(page); -} - -static inline void SetPageSlabPfmemalloc(struct page *page) -{ - VM_BUG_ON_PAGE(!PageSlab(page), page); - SetPageActive(page); -} - -static inline void __ClearPageSlabPfmemalloc(struct page *page) -{ - VM_BUG_ON_PAGE(!PageSlab(page), page); - __ClearPageActive(page); -} - -static inline void ClearPageSlabPfmemalloc(struct page *page) -{ - VM_BUG_ON_PAGE(!PageSlab(page), page); - ClearPageActive(page); -} - #ifdef CONFIG_MMU #define __PG_MLOCKED (1UL << PG_mlocked) #else -- cgit v1.2.3 From 703f7066f40599c290babdb79dd61319264987e9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 7 Dec 2021 13:17:33 +0100 Subject: random: remove unused irq_flags argument from add_interrupt_randomness() Since commit ee3e00e9e7101 ("random: use registers from interrupted code for CPU's w/o a cycle counter") the irq_flags argument is no longer used. Remove unused irq_flags. Cc: Borislav Petkov Cc: Dave Hansen Cc: Dexuan Cui Cc: H. Peter Anvin Cc: Haiyang Zhang Cc: Ingo Molnar Cc: K. Y. Srinivasan Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Wei Liu Cc: linux-hyperv@vger.kernel.org Cc: x86@kernel.org Signed-off-by: Sebastian Andrzej Siewior Acked-by: Wei Liu Signed-off-by: Jason A. 
Donenfeld --- include/linux/random.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index f45b8be3e3c4..c45b2693e51f 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -35,7 +35,7 @@ static inline void add_latent_entropy(void) {} extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy; +extern void add_interrupt_randomness(int irq) __latent_entropy; extern void get_random_bytes(void *buf, int nbytes); extern int wait_for_random_bytes(void); -- cgit v1.2.3 From 79b60ca83b6fa63ef307d2edcc77ee6581da8971 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 12 Dec 2021 14:51:27 +0200 Subject: net/mlx5: Introduce API for bulk request and release of IRQs Currently IRQs are requested one by one. To balance spreading IRQs among cpus using such scheme requires remembering cpu mask for the cpus used for a given device. This complicates the IRQ allocation scheme in subsequent patch. Hence, prepare the code for bulk IRQs allocation. This enables spreading IRQs among cpus in subsequent patch. 
Signed-off-by: Shay Drory Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h index ea3ff5a8ced3..3705a382276b 100644 --- a/include/linux/mlx5/eq.h +++ b/include/linux/mlx5/eq.h @@ -9,13 +9,13 @@ #define MLX5_NUM_SPARE_EQE (0x80) struct mlx5_eq; +struct mlx5_irq; struct mlx5_core_dev; struct mlx5_eq_param { - u8 irq_index; int nent; u64 mask[4]; - cpumask_var_t affinity; + struct mlx5_irq *irq; }; struct mlx5_eq * -- cgit v1.2.3 From 3663f26b389b3951426971b44bb9312fdff0efec Mon Sep 17 00:00:00 2001 From: Ajit Kumar Pandey Date: Sun, 12 Dec 2021 23:35:24 +0530 Subject: drivers: acpi: acpi_apd: Remove unused device property "is-rv" Initially "is-rv" device property is added for 48MHz fixed clock support on Raven or RV architecture. It's unused now as we moved to pci device_id based selection to extend such support on other architectures. This change removed unused code from acpi driver. Signed-off-by: Ajit Kumar Pandey Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20211212180527.1641362-3-AjitKumar.Pandey@amd.com Signed-off-by: Stephen Boyd --- include/linux/platform_data/clk-fch.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/clk-fch.h b/include/linux/platform_data/clk-fch.h index b9f682459f08..850ca776156d 100644 --- a/include/linux/platform_data/clk-fch.h +++ b/include/linux/platform_data/clk-fch.h @@ -12,7 +12,6 @@ struct fch_clk_data { void __iomem *base; - u32 is_rv; }; #endif /* __CLK_FCH_H */ -- cgit v1.2.3 From 7fdb98e8a768b3ccc05494d3ea4436047f512b9d Mon Sep 17 00:00:00 2001 From: Ajit Kumar Pandey Date: Sun, 12 Dec 2021 23:35:25 +0530 Subject: ACPI: APD: Add a fmw property clk-name Add a new device property to fetch clk-name from firmware. 
Signed-off-by: Ajit Kumar Pandey Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20211212180527.1641362-4-AjitKumar.Pandey@amd.com Signed-off-by: Stephen Boyd --- include/linux/platform_data/clk-fch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/clk-fch.h b/include/linux/platform_data/clk-fch.h index 850ca776156d..11a2a23fd9b2 100644 --- a/include/linux/platform_data/clk-fch.h +++ b/include/linux/platform_data/clk-fch.h @@ -12,6 +12,7 @@ struct fch_clk_data { void __iomem *base; + char *name; }; #endif /* __CLK_FCH_H */ -- cgit v1.2.3 From 2cee6fbb7f01bcb25f11ef1439e89a29de4c0c1d Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Oct 2021 21:53:44 +0100 Subject: fscache: Remove the contents of the fscache driver, pending rewrite Remove the code that comprises the fscache driver as it's going to be substantially rewritten, with the majority of the code being erased in the rewrite. A small piece of linux/fscache.h is left as that is #included by a bunch of network filesystems. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819578724.215744.18210619052245724238.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906884814.143852.6727245089843862889.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967077097.1823006.1377665951499979089.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021485548.640689.13876080567388696162.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 548 +-------------------------- include/linux/fscache.h | 851 +----------------------------------------- 2 files changed, 3 insertions(+), 1396 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 8d39491c5f9f..47f21a53ac4b 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* General filesystem caching backing cache interface * - * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * NOTE!!! 
See: @@ -15,551 +15,5 @@ #define _LINUX_FSCACHE_CACHE_H #include -#include -#include - -#define NR_MAXCACHES BITS_PER_LONG - -struct fscache_cache; -struct fscache_cache_ops; -struct fscache_object; -struct fscache_operation; - -enum fscache_obj_ref_trace { - fscache_obj_get_add_to_deps, - fscache_obj_get_queue, - fscache_obj_put_alloc_fail, - fscache_obj_put_attach_fail, - fscache_obj_put_drop_obj, - fscache_obj_put_enq_dep, - fscache_obj_put_queue, - fscache_obj_put_work, - fscache_obj_ref__nr_traces -}; - -/* - * cache tag definition - */ -struct fscache_cache_tag { - struct list_head link; - struct fscache_cache *cache; /* cache referred to by this tag */ - unsigned long flags; -#define FSCACHE_TAG_RESERVED 0 /* T if tag is reserved for a cache */ - atomic_t usage; - char name[]; /* tag name */ -}; - -/* - * cache definition - */ -struct fscache_cache { - const struct fscache_cache_ops *ops; - struct fscache_cache_tag *tag; /* tag representing this cache */ - struct kobject *kobj; /* system representation of this cache */ - struct list_head link; /* link in list of caches */ - size_t max_index_size; /* maximum size of index data */ - char identifier[36]; /* cache label */ - - /* node management */ - struct work_struct op_gc; /* operation garbage collector */ - struct list_head object_list; /* list of data/index objects */ - struct list_head op_gc_list; /* list of ops to be deleted */ - spinlock_t object_list_lock; - spinlock_t op_gc_list_lock; - atomic_t object_count; /* no. 
of live objects in this cache */ - struct fscache_object *fsdef; /* object for the fsdef index */ - unsigned long flags; -#define FSCACHE_IOERROR 0 /* cache stopped on I/O error */ -#define FSCACHE_CACHE_WITHDRAWN 1 /* cache has been withdrawn */ -}; - -extern wait_queue_head_t fscache_cache_cleared_wq; - -/* - * operation to be applied to a cache object - * - retrieval initiation operations are done in the context of the process - * that issued them, and not in an async thread pool - */ -typedef void (*fscache_operation_release_t)(struct fscache_operation *op); -typedef void (*fscache_operation_processor_t)(struct fscache_operation *op); -typedef void (*fscache_operation_cancel_t)(struct fscache_operation *op); - -enum fscache_operation_state { - FSCACHE_OP_ST_BLANK, /* Op is not yet submitted */ - FSCACHE_OP_ST_INITIALISED, /* Op is initialised */ - FSCACHE_OP_ST_PENDING, /* Op is blocked from running */ - FSCACHE_OP_ST_IN_PROGRESS, /* Op is in progress */ - FSCACHE_OP_ST_COMPLETE, /* Op is complete */ - FSCACHE_OP_ST_CANCELLED, /* Op has been cancelled */ - FSCACHE_OP_ST_DEAD /* Op is now dead */ -}; - -struct fscache_operation { - struct work_struct work; /* record for async ops */ - struct list_head pend_link; /* link in object->pending_ops */ - struct fscache_object *object; /* object to be operated upon */ - - unsigned long flags; -#define FSCACHE_OP_TYPE 0x000f /* operation type */ -#define FSCACHE_OP_ASYNC 0x0001 /* - async op, processor may sleep for disk */ -#define FSCACHE_OP_MYTHREAD 0x0002 /* - processing is done be issuing thread, not pool */ -#define FSCACHE_OP_WAITING 4 /* cleared when op is woken */ -#define FSCACHE_OP_EXCLUSIVE 5 /* exclusive op, other ops must wait */ -#define FSCACHE_OP_DEC_READ_CNT 6 /* decrement object->n_reads on destruction */ -#define FSCACHE_OP_UNUSE_COOKIE 7 /* call fscache_unuse_cookie() on completion */ -#define FSCACHE_OP_KEEP_FLAGS 0x00f0 /* flags to keep when repurposing an op */ - - enum fscache_operation_state 
state; - atomic_t usage; - unsigned debug_id; /* debugging ID */ - - /* operation processor callback - * - can be NULL if FSCACHE_OP_WAITING is going to be used to perform - * the op in a non-pool thread */ - fscache_operation_processor_t processor; - - /* Operation cancellation cleanup (optional) */ - fscache_operation_cancel_t cancel; - - /* operation releaser */ - fscache_operation_release_t release; -}; - -extern atomic_t fscache_op_debug_id; -extern void fscache_op_work_func(struct work_struct *work); - -extern void fscache_enqueue_operation(struct fscache_operation *); -extern void fscache_op_complete(struct fscache_operation *, bool); -extern void fscache_put_operation(struct fscache_operation *); -extern void fscache_operation_init(struct fscache_cookie *, - struct fscache_operation *, - fscache_operation_processor_t, - fscache_operation_cancel_t, - fscache_operation_release_t); - -/* - * data read operation - */ -struct fscache_retrieval { - struct fscache_operation op; - struct fscache_cookie *cookie; /* The netfs cookie */ - struct address_space *mapping; /* netfs pages */ - fscache_rw_complete_t end_io_func; /* function to call on I/O completion */ - void *context; /* netfs read context (pinned) */ - struct list_head to_do; /* list of things to be done by the backend */ - atomic_t n_pages; /* number of pages to be retrieved */ -}; - -typedef int (*fscache_page_retrieval_func_t)(struct fscache_retrieval *op, - struct page *page, - gfp_t gfp); - -typedef int (*fscache_pages_retrieval_func_t)(struct fscache_retrieval *op, - struct list_head *pages, - unsigned *nr_pages, - gfp_t gfp); - -/** - * fscache_get_retrieval - Get an extra reference on a retrieval operation - * @op: The retrieval operation to get a reference on - * - * Get an extra reference on a retrieval operation. 
- */ -static inline -struct fscache_retrieval *fscache_get_retrieval(struct fscache_retrieval *op) -{ - atomic_inc(&op->op.usage); - return op; -} - -/** - * fscache_enqueue_retrieval - Enqueue a retrieval operation for processing - * @op: The retrieval operation affected - * - * Enqueue a retrieval operation for processing by the FS-Cache thread pool. - */ -static inline void fscache_enqueue_retrieval(struct fscache_retrieval *op) -{ - fscache_enqueue_operation(&op->op); -} - -/** - * fscache_retrieval_complete - Record (partial) completion of a retrieval - * @op: The retrieval operation affected - * @n_pages: The number of pages to account for - */ -static inline void fscache_retrieval_complete(struct fscache_retrieval *op, - int n_pages) -{ - if (atomic_sub_return_relaxed(n_pages, &op->n_pages) <= 0) - fscache_op_complete(&op->op, false); -} - -/** - * fscache_put_retrieval - Drop a reference to a retrieval operation - * @op: The retrieval operation affected - * - * Drop a reference to a retrieval operation. 
- */ -static inline void fscache_put_retrieval(struct fscache_retrieval *op) -{ - fscache_put_operation(&op->op); -} - -/* - * cached page storage work item - * - used to do three things: - * - batch writes to the cache - * - do cache writes asynchronously - * - defer writes until cache object lookup completion - */ -struct fscache_storage { - struct fscache_operation op; - pgoff_t store_limit; /* don't write more than this */ -}; - -/* - * cache operations - */ -struct fscache_cache_ops { - /* name of cache provider */ - const char *name; - - /* allocate an object record for a cookie */ - struct fscache_object *(*alloc_object)(struct fscache_cache *cache, - struct fscache_cookie *cookie); - - /* look up the object for a cookie - * - return -ETIMEDOUT to be requeued - */ - int (*lookup_object)(struct fscache_object *object); - - /* finished looking up */ - void (*lookup_complete)(struct fscache_object *object); - - /* increment the usage count on this object (may fail if unmounting) */ - struct fscache_object *(*grab_object)(struct fscache_object *object, - enum fscache_obj_ref_trace why); - - /* pin an object in the cache */ - int (*pin_object)(struct fscache_object *object); - - /* unpin an object in the cache */ - void (*unpin_object)(struct fscache_object *object); - - /* check the consistency between the backing cache and the FS-Cache - * cookie */ - int (*check_consistency)(struct fscache_operation *op); - - /* store the updated auxiliary data on an object */ - void (*update_object)(struct fscache_object *object); - - /* Invalidate an object */ - void (*invalidate_object)(struct fscache_operation *op); - - /* discard the resources pinned by an object and effect retirement if - * necessary */ - void (*drop_object)(struct fscache_object *object); - - /* dispose of a reference to an object */ - void (*put_object)(struct fscache_object *object, - enum fscache_obj_ref_trace why); - - /* sync a cache */ - void (*sync_cache)(struct fscache_cache *cache); - - /* 
notification that the attributes of a non-index object (such as - * i_size) have changed */ - int (*attr_changed)(struct fscache_object *object); - - /* reserve space for an object's data and associated metadata */ - int (*reserve_space)(struct fscache_object *object, loff_t i_size); - - /* request a backing block for a page be read or allocated in the - * cache */ - fscache_page_retrieval_func_t read_or_alloc_page; - - /* request backing blocks for a list of pages be read or allocated in - * the cache */ - fscache_pages_retrieval_func_t read_or_alloc_pages; - - /* request a backing block for a page be allocated in the cache so that - * it can be written directly */ - fscache_page_retrieval_func_t allocate_page; - - /* request backing blocks for pages be allocated in the cache so that - * they can be written directly */ - fscache_pages_retrieval_func_t allocate_pages; - - /* write a page to its backing block in the cache */ - int (*write_page)(struct fscache_storage *op, struct page *page); - - /* detach backing block from a page (optional) - * - must release the cookie lock before returning - * - may sleep - */ - void (*uncache_page)(struct fscache_object *object, - struct page *page); - - /* dissociate a cache from all the pages it was backing */ - void (*dissociate_pages)(struct fscache_cache *cache); - - /* Begin a read operation for the netfs lib */ - int (*begin_read_operation)(struct netfs_read_request *rreq, - struct fscache_retrieval *op); -}; - -extern struct fscache_cookie fscache_fsdef_index; - -/* - * Event list for fscache_object::{event_mask,events} - */ -enum { - FSCACHE_OBJECT_EV_NEW_CHILD, /* T if object has a new child */ - FSCACHE_OBJECT_EV_PARENT_READY, /* T if object's parent is ready */ - FSCACHE_OBJECT_EV_UPDATE, /* T if object should be updated */ - FSCACHE_OBJECT_EV_INVALIDATE, /* T if cache requested object invalidation */ - FSCACHE_OBJECT_EV_CLEARED, /* T if accessors all gone */ - FSCACHE_OBJECT_EV_ERROR, /* T if fatal error occurred 
during processing */ - FSCACHE_OBJECT_EV_KILL, /* T if netfs relinquished or cache withdrew object */ - NR_FSCACHE_OBJECT_EVENTS -}; - -#define FSCACHE_OBJECT_EVENTS_MASK ((1UL << NR_FSCACHE_OBJECT_EVENTS) - 1) - -/* - * States for object state machine. - */ -struct fscache_transition { - unsigned long events; - const struct fscache_state *transit_to; -}; - -struct fscache_state { - char name[24]; - char short_name[8]; - const struct fscache_state *(*work)(struct fscache_object *object, - int event); - const struct fscache_transition transitions[]; -}; - -/* - * on-disk cache file or index handle - */ -struct fscache_object { - const struct fscache_state *state; /* Object state machine state */ - const struct fscache_transition *oob_table; /* OOB state transition table */ - int debug_id; /* debugging ID */ - int n_children; /* number of child objects */ - int n_ops; /* number of extant ops on object */ - int n_obj_ops; /* number of object ops outstanding on object */ - int n_in_progress; /* number of ops in progress */ - int n_exclusive; /* number of exclusive ops queued or in progress */ - atomic_t n_reads; /* number of read ops in progress */ - spinlock_t lock; /* state and operations lock */ - - unsigned long lookup_jif; /* time at which lookup started */ - unsigned long oob_event_mask; /* OOB events this object is interested in */ - unsigned long event_mask; /* events this object is interested in */ - unsigned long events; /* events to be processed by this object - * (order is important - using fls) */ - - unsigned long flags; -#define FSCACHE_OBJECT_LOCK 0 /* T if object is busy being processed */ -#define FSCACHE_OBJECT_PENDING_WRITE 1 /* T if object has pending write */ -#define FSCACHE_OBJECT_WAITING 2 /* T if object is waiting on its parent */ -#define FSCACHE_OBJECT_IS_LIVE 3 /* T if object is not withdrawn or relinquished */ -#define FSCACHE_OBJECT_IS_LOOKED_UP 4 /* T if object has been looked up */ -#define FSCACHE_OBJECT_IS_AVAILABLE 5 /* T if object 
has become active */ -#define FSCACHE_OBJECT_RETIRED 6 /* T if object was retired on relinquishment */ -#define FSCACHE_OBJECT_KILLED_BY_CACHE 7 /* T if object was killed by the cache */ -#define FSCACHE_OBJECT_RUN_AFTER_DEAD 8 /* T if object has been dispatched after death */ - - struct list_head cache_link; /* link in cache->object_list */ - struct hlist_node cookie_link; /* link in cookie->backing_objects */ - struct fscache_cache *cache; /* cache that supplied this object */ - struct fscache_cookie *cookie; /* netfs's file/index object */ - struct fscache_object *parent; /* parent object */ - struct work_struct work; /* attention scheduling record */ - struct list_head dependents; /* FIFO of dependent objects */ - struct list_head dep_link; /* link in parent's dependents list */ - struct list_head pending_ops; /* unstarted operations on this object */ - pgoff_t store_limit; /* current storage limit */ - loff_t store_limit_l; /* current storage limit */ -}; - -extern void fscache_object_init(struct fscache_object *, struct fscache_cookie *, - struct fscache_cache *); -extern void fscache_object_destroy(struct fscache_object *); - -extern void fscache_object_lookup_negative(struct fscache_object *object); -extern void fscache_obtained_object(struct fscache_object *object); - -static inline bool fscache_object_is_live(struct fscache_object *object) -{ - return test_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); -} - -static inline bool fscache_object_is_dying(struct fscache_object *object) -{ - return !fscache_object_is_live(object); -} - -static inline bool fscache_object_is_available(struct fscache_object *object) -{ - return test_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags); -} - -static inline bool fscache_cache_is_broken(struct fscache_object *object) -{ - return test_bit(FSCACHE_IOERROR, &object->cache->flags); -} - -static inline bool fscache_object_is_active(struct fscache_object *object) -{ - return fscache_object_is_available(object) && - 
fscache_object_is_live(object) && - !fscache_cache_is_broken(object); -} - -/** - * fscache_object_destroyed - Note destruction of an object in a cache - * @cache: The cache from which the object came - * - * Note the destruction and deallocation of an object record in a cache. - */ -static inline void fscache_object_destroyed(struct fscache_cache *cache) -{ - if (atomic_dec_and_test(&cache->object_count)) - wake_up_all(&fscache_cache_cleared_wq); -} - -/** - * fscache_object_lookup_error - Note an object encountered an error - * @object: The object on which the error was encountered - * - * Note that an object encountered a fatal error (usually an I/O error) and - * that it should be withdrawn as soon as possible. - */ -static inline void fscache_object_lookup_error(struct fscache_object *object) -{ - set_bit(FSCACHE_OBJECT_EV_ERROR, &object->events); -} - -/** - * fscache_set_store_limit - Set the maximum size to be stored in an object - * @object: The object to set the maximum on - * @i_size: The limit to set in bytes - * - * Set the maximum size an object is permitted to reach, implying the highest - * byte that may be written. Intended to be called by the attr_changed() op. - * - * See Documentation/filesystems/caching/backend-api.rst for a complete - * description. - */ -static inline -void fscache_set_store_limit(struct fscache_object *object, loff_t i_size) -{ - object->store_limit_l = i_size; - object->store_limit = i_size >> PAGE_SHIFT; - if (i_size & ~PAGE_MASK) - object->store_limit++; -} - -/** - * fscache_end_io - End a retrieval operation on a page - * @op: The FS-Cache operation covering the retrieval - * @page: The page that was to be fetched - * @error: The error code (0 if successful) - * - * Note the end of an operation to retrieve a page, as covered by a particular - * operation record. 
- */ -static inline void fscache_end_io(struct fscache_retrieval *op, - struct page *page, int error) -{ - op->end_io_func(page, op->context, error); -} - -static inline void __fscache_use_cookie(struct fscache_cookie *cookie) -{ - atomic_inc(&cookie->n_active); -} - -/** - * fscache_use_cookie - Request usage of cookie attached to an object - * @object: Object description - * - * Request usage of the cookie attached to an object. NULL is returned if the - * relinquishment had reduced the cookie usage count to 0. - */ -static inline bool fscache_use_cookie(struct fscache_object *object) -{ - struct fscache_cookie *cookie = object->cookie; - return atomic_inc_not_zero(&cookie->n_active) != 0; -} - -static inline bool __fscache_unuse_cookie(struct fscache_cookie *cookie) -{ - return atomic_dec_and_test(&cookie->n_active); -} - -static inline void __fscache_wake_unused_cookie(struct fscache_cookie *cookie) -{ - wake_up_var(&cookie->n_active); -} - -/** - * fscache_unuse_cookie - Cease usage of cookie attached to an object - * @object: Object description - * - * Cease usage of the cookie attached to an object. When the users count - * reaches zero then the cookie relinquishment will be permitted to proceed. 
- */ -static inline void fscache_unuse_cookie(struct fscache_object *object) -{ - struct fscache_cookie *cookie = object->cookie; - if (__fscache_unuse_cookie(cookie)) - __fscache_wake_unused_cookie(cookie); -} - -/* - * out-of-line cache backend functions - */ -extern __printf(3, 4) -void fscache_init_cache(struct fscache_cache *cache, - const struct fscache_cache_ops *ops, - const char *idfmt, ...); - -extern int fscache_add_cache(struct fscache_cache *cache, - struct fscache_object *fsdef, - const char *tagname); -extern void fscache_withdraw_cache(struct fscache_cache *cache); - -extern void fscache_io_error(struct fscache_cache *cache); - -extern void fscache_mark_page_cached(struct fscache_retrieval *op, - struct page *page); - -extern void fscache_mark_pages_cached(struct fscache_retrieval *op, - struct pagevec *pagevec); - -extern bool fscache_object_sleep_till_congested(signed long *timeoutp); - -extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object, - const void *data, - uint16_t datalen, - loff_t object_size); - -extern void fscache_object_retrying_stale(struct fscache_object *object); - -enum fscache_why_object_killed { - FSCACHE_OBJECT_IS_STALE, - FSCACHE_OBJECT_NO_SPACE, - FSCACHE_OBJECT_WAS_RETIRED, - FSCACHE_OBJECT_WAS_CULLED, -}; -extern void fscache_object_mark_killed(struct fscache_object *object, - enum fscache_why_object_killed why); #endif /* _LINUX_FSCACHE_CACHE_H */ diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 3b2282c157f7..0364a4ca16f6 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -15,861 +15,14 @@ #define _LINUX_FSCACHE_H #include -#include -#include -#include -#include #include #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE) -#define fscache_available() (1) #define fscache_cookie_valid(cookie) (cookie) +#define fscache_cookie_enabled(cookie) (cookie) #else -#define fscache_available() (0) #define fscache_cookie_valid(cookie) (0) +#define 
fscache_cookie_enabled(cookie) (0) #endif - -/* pattern used to fill dead space in an index entry */ -#define FSCACHE_INDEX_DEADFILL_PATTERN 0x79 - -struct pagevec; -struct fscache_cache_tag; -struct fscache_cookie; -struct fscache_netfs; -struct netfs_read_request; - -typedef void (*fscache_rw_complete_t)(struct page *page, - void *context, - int error); - -/* result of index entry consultation */ -enum fscache_checkaux { - FSCACHE_CHECKAUX_OKAY, /* entry okay as is */ - FSCACHE_CHECKAUX_NEEDS_UPDATE, /* entry requires update */ - FSCACHE_CHECKAUX_OBSOLETE, /* entry requires deletion */ -}; - -/* - * fscache cookie definition - */ -struct fscache_cookie_def { - /* name of cookie type */ - char name[16]; - - /* cookie type */ - uint8_t type; -#define FSCACHE_COOKIE_TYPE_INDEX 0 -#define FSCACHE_COOKIE_TYPE_DATAFILE 1 - - /* select the cache into which to insert an entry in this index - * - optional - * - should return a cache identifier or NULL to cause the cache to be - * inherited from the parent if possible or the first cache picked - * for a non-index file if not - */ - struct fscache_cache_tag *(*select_cache)( - const void *parent_netfs_data, - const void *cookie_netfs_data); - - /* consult the netfs about the state of an object - * - this function can be absent if the index carries no state data - * - the netfs data from the cookie being used as the target is - * presented, as is the auxiliary data and the object size - */ - enum fscache_checkaux (*check_aux)(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size); - - /* get an extra reference on a read context - * - this function can be absent if the completion function doesn't - * require a context - */ - void (*get_context)(void *cookie_netfs_data, void *context); - - /* release an extra reference on a read context - * - this function can be absent if the completion function doesn't - * require a context - */ - void (*put_context)(void *cookie_netfs_data, void *context); - 
- /* indicate page that now have cache metadata retained - * - this function should mark the specified page as now being cached - * - the page will have been marked with PG_fscache before this is - * called, so this is optional - */ - void (*mark_page_cached)(void *cookie_netfs_data, - struct address_space *mapping, - struct page *page); -}; - -/* - * fscache cached network filesystem type - * - name, version and ops must be filled in before registration - * - all other fields will be set during registration - */ -struct fscache_netfs { - uint32_t version; /* indexing version */ - const char *name; /* filesystem name */ - struct fscache_cookie *primary_index; -}; - -/* - * data file or index object cookie - * - a file will only appear in one cache - * - a request to cache a file may or may not be honoured, subject to - * constraints such as disk space - * - indices are created on disk just-in-time - */ -struct fscache_cookie { - refcount_t ref; /* number of users of this cookie */ - atomic_t n_children; /* number of children of this cookie */ - atomic_t n_active; /* number of active users of netfs ptrs */ - unsigned int debug_id; - spinlock_t lock; - spinlock_t stores_lock; /* lock on page store tree */ - struct hlist_head backing_objects; /* object(s) backing this file/index */ - const struct fscache_cookie_def *def; /* definition */ - struct fscache_cookie *parent; /* parent of this entry */ - struct hlist_bl_node hash_link; /* Link in hash table */ - struct list_head proc_link; /* Link in proc list */ - void *netfs_data; /* back pointer to netfs */ - struct radix_tree_root stores; /* pages to be stored on this cookie */ -#define FSCACHE_COOKIE_PENDING_TAG 0 /* pages tag: pending write to cache */ -#define FSCACHE_COOKIE_STORING_TAG 1 /* pages tag: writing to cache */ - - unsigned long flags; -#define FSCACHE_COOKIE_LOOKING_UP 0 /* T if non-index cookie being looked up still */ -#define FSCACHE_COOKIE_NO_DATA_YET 1 /* T if new object with no cached data yet */ 
-#define FSCACHE_COOKIE_UNAVAILABLE 2 /* T if cookie is unavailable (error, etc) */ -#define FSCACHE_COOKIE_INVALIDATING 3 /* T if cookie is being invalidated */ -#define FSCACHE_COOKIE_RELINQUISHED 4 /* T if cookie has been relinquished */ -#define FSCACHE_COOKIE_ENABLED 5 /* T if cookie is enabled */ -#define FSCACHE_COOKIE_ENABLEMENT_LOCK 6 /* T if cookie is being en/disabled */ -#define FSCACHE_COOKIE_AUX_UPDATED 8 /* T if the auxiliary data was updated */ -#define FSCACHE_COOKIE_ACQUIRED 9 /* T if cookie is in use */ -#define FSCACHE_COOKIE_RELINQUISHING 10 /* T if cookie is being relinquished */ - - u8 type; /* Type of object */ - u8 key_len; /* Length of index key */ - u8 aux_len; /* Length of auxiliary data */ - u32 key_hash; /* Hash of parent, type, key, len */ - union { - void *key; /* Index key */ - u8 inline_key[16]; /* - If the key is short enough */ - }; - union { - void *aux; /* Auxiliary data */ - u8 inline_aux[8]; /* - If the aux data is short enough */ - }; -}; - -static inline bool fscache_cookie_enabled(struct fscache_cookie *cookie) -{ - return fscache_cookie_valid(cookie) && test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); -} - -/* - * slow-path functions for when there is actually caching available, and the - * netfs does actually have a valid token - * - these are not to be called directly - * - these are undefined symbols when FS-Cache is not configured and the - * optimiser takes care of not using them - */ -extern int __fscache_register_netfs(struct fscache_netfs *); -extern void __fscache_unregister_netfs(struct fscache_netfs *); -extern struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *); -extern void __fscache_release_cache_tag(struct fscache_cache_tag *); - -extern struct fscache_cookie *__fscache_acquire_cookie( - struct fscache_cookie *, - const struct fscache_cookie_def *, - const void *, size_t, - const void *, size_t, - void *, loff_t, bool); -extern void __fscache_relinquish_cookie(struct fscache_cookie *, const 
void *, bool); -extern int __fscache_check_consistency(struct fscache_cookie *, const void *); -extern void __fscache_update_cookie(struct fscache_cookie *, const void *); -extern int __fscache_attr_changed(struct fscache_cookie *); -extern void __fscache_invalidate(struct fscache_cookie *); -extern void __fscache_wait_on_invalidate(struct fscache_cookie *); - -#ifdef FSCACHE_USE_NEW_IO_API -extern int __fscache_begin_read_operation(struct netfs_read_request *, struct fscache_cookie *); -#else -extern int __fscache_read_or_alloc_page(struct fscache_cookie *, - struct page *, - fscache_rw_complete_t, - void *, - gfp_t); -extern int __fscache_read_or_alloc_pages(struct fscache_cookie *, - struct address_space *, - struct list_head *, - unsigned *, - fscache_rw_complete_t, - void *, - gfp_t); -extern int __fscache_alloc_page(struct fscache_cookie *, struct page *, gfp_t); -extern int __fscache_write_page(struct fscache_cookie *, struct page *, loff_t, gfp_t); -extern void __fscache_uncache_page(struct fscache_cookie *, struct page *); -extern bool __fscache_check_page_write(struct fscache_cookie *, struct page *); -extern void __fscache_wait_on_page_write(struct fscache_cookie *, struct page *); -extern bool __fscache_maybe_release_page(struct fscache_cookie *, struct page *, - gfp_t); -extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *, - struct inode *); -extern void __fscache_readpages_cancel(struct fscache_cookie *cookie, - struct list_head *pages); -#endif /* FSCACHE_USE_NEW_IO_API */ - -extern void __fscache_disable_cookie(struct fscache_cookie *, const void *, bool); -extern void __fscache_enable_cookie(struct fscache_cookie *, const void *, loff_t, - bool (*)(void *), void *); - -/** - * fscache_register_netfs - Register a filesystem as desiring caching services - * @netfs: The description of the filesystem - * - * Register a filesystem as desiring caching services if they're available. 
- * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -int fscache_register_netfs(struct fscache_netfs *netfs) -{ - if (fscache_available()) - return __fscache_register_netfs(netfs); - else - return 0; -} - -/** - * fscache_unregister_netfs - Indicate that a filesystem no longer desires - * caching services - * @netfs: The description of the filesystem - * - * Indicate that a filesystem no longer desires caching services for the - * moment. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -void fscache_unregister_netfs(struct fscache_netfs *netfs) -{ - if (fscache_available()) - __fscache_unregister_netfs(netfs); -} - -/** - * fscache_lookup_cache_tag - Look up a cache tag - * @name: The name of the tag to search for - * - * Acquire a specific cache referral tag that can be used to select a specific - * cache in which to cache an index. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name) -{ - if (fscache_available()) - return __fscache_lookup_cache_tag(name); - else - return NULL; -} - -/** - * fscache_release_cache_tag - Release a cache tag - * @tag: The tag to release - * - * Release a reference to a cache referral tag previously looked up. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. 
- */ -static inline -void fscache_release_cache_tag(struct fscache_cache_tag *tag) -{ - if (fscache_available()) - __fscache_release_cache_tag(tag); -} - -/** - * fscache_acquire_cookie - Acquire a cookie to represent a cache object - * @parent: The cookie that's to be the parent of this one - * @def: A description of the cache object, including callback operations - * @index_key: The index key for this cookie - * @index_key_len: Size of the index key - * @aux_data: The auxiliary data for the cookie (may be NULL) - * @aux_data_len: Size of the auxiliary data buffer - * @netfs_data: An arbitrary piece of data to be kept in the cookie to - * represent the cache object to the netfs - * @object_size: The initial size of object - * @enable: Whether or not to enable a data cookie immediately - * - * This function is used to inform FS-Cache about part of an index hierarchy - * that can be used to locate files. This is done by requesting a cookie for - * each index in the path to the file. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. 
- */ -static inline -struct fscache_cookie *fscache_acquire_cookie( - struct fscache_cookie *parent, - const struct fscache_cookie_def *def, - const void *index_key, - size_t index_key_len, - const void *aux_data, - size_t aux_data_len, - void *netfs_data, - loff_t object_size, - bool enable) -{ - if (fscache_cookie_valid(parent) && fscache_cookie_enabled(parent)) - return __fscache_acquire_cookie(parent, def, - index_key, index_key_len, - aux_data, aux_data_len, - netfs_data, object_size, enable); - else - return NULL; -} - -/** - * fscache_relinquish_cookie - Return the cookie to the cache, maybe discarding - * it - * @cookie: The cookie being returned - * @aux_data: The updated auxiliary data for the cookie (may be NULL) - * @retire: True if the cache object the cookie represents is to be discarded - * - * This function returns a cookie to the cache, forcibly discarding the - * associated cache object if retire is set to true. The opportunity is - * provided to update the auxiliary data in the cache before the object is - * disconnected. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -void fscache_relinquish_cookie(struct fscache_cookie *cookie, - const void *aux_data, - bool retire) -{ - if (fscache_cookie_valid(cookie)) - __fscache_relinquish_cookie(cookie, aux_data, retire); -} - -/** - * fscache_check_consistency - Request validation of a cache's auxiliary data - * @cookie: The cookie representing the cache object - * @aux_data: The updated auxiliary data for the cookie (may be NULL) - * - * Request an consistency check from fscache, which passes the request to the - * backing cache. The auxiliary data on the cookie will be updated first if - * @aux_data is set. - * - * Returns 0 if consistent and -ESTALE if inconsistent. May also - * return -ENOMEM and -ERESTARTSYS. 
- */ -static inline -int fscache_check_consistency(struct fscache_cookie *cookie, - const void *aux_data) -{ - if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) - return __fscache_check_consistency(cookie, aux_data); - else - return 0; -} - -/** - * fscache_update_cookie - Request that a cache object be updated - * @cookie: The cookie representing the cache object - * @aux_data: The updated auxiliary data for the cookie (may be NULL) - * - * Request an update of the index data for the cache object associated with the - * cookie. The auxiliary data on the cookie will be updated first if @aux_data - * is set. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data) -{ - if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) - __fscache_update_cookie(cookie, aux_data); -} - -/** - * fscache_pin_cookie - Pin a data-storage cache object in its cache - * @cookie: The cookie representing the cache object - * - * Permit data-storage cache objects to be pinned in the cache. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -int fscache_pin_cookie(struct fscache_cookie *cookie) -{ - return -ENOBUFS; -} - -/** - * fscache_pin_cookie - Unpin a data-storage cache object in its cache - * @cookie: The cookie representing the cache object - * - * Permit data-storage cache objects to be unpinned from the cache. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -void fscache_unpin_cookie(struct fscache_cookie *cookie) -{ -} - -/** - * fscache_attr_changed - Notify cache that an object's attributes changed - * @cookie: The cookie representing the cache object - * - * Send a notification to the cache indicating that an object's attributes have - * changed. This includes the data size. 
These attributes will be obtained - * through the get_attr() cookie definition op. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -int fscache_attr_changed(struct fscache_cookie *cookie) -{ - if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) - return __fscache_attr_changed(cookie); - else - return -ENOBUFS; -} - -/** - * fscache_invalidate - Notify cache that an object needs invalidation - * @cookie: The cookie representing the cache object - * - * Notify the cache that an object is needs to be invalidated and that it - * should abort any retrievals or stores it is doing on the cache. The object - * is then marked non-caching until such time as the invalidation is complete. - * - * This can be called with spinlocks held. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -void fscache_invalidate(struct fscache_cookie *cookie) -{ - if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) - __fscache_invalidate(cookie); -} - -/** - * fscache_wait_on_invalidate - Wait for invalidation to complete - * @cookie: The cookie representing the cache object - * - * Wait for the invalidation of an object to complete. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -void fscache_wait_on_invalidate(struct fscache_cookie *cookie) -{ - if (fscache_cookie_valid(cookie)) - __fscache_wait_on_invalidate(cookie); -} - -/** - * fscache_reserve_space - Reserve data space for a cached object - * @cookie: The cookie representing the cache object - * @i_size: The amount of space to be reserved - * - * Reserve an amount of space in the cache for the cache object attached to a - * cookie so that a write to that object within the space can always be - * honoured. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. 
- */ -static inline -int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size) -{ - return -ENOBUFS; -} - -#ifdef FSCACHE_USE_NEW_IO_API - -/** - * fscache_begin_read_operation - Begin a read operation for the netfs lib - * @rreq: The read request being undertaken - * @cookie: The cookie representing the cache object - * - * Begin a read operation on behalf of the netfs helper library. @rreq - * indicates the read request to which the operation state should be attached; - * @cookie indicates the cache object that will be accessed. - * - * This is intended to be called from the ->begin_cache_operation() netfs lib - * operation as implemented by the network filesystem. - * - * Returns: - * * 0 - Success - * * -ENOBUFS - No caching available - * * Other error code from the cache, such as -ENOMEM. - */ -static inline -int fscache_begin_read_operation(struct netfs_read_request *rreq, - struct fscache_cookie *cookie) -{ - if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) - return __fscache_begin_read_operation(rreq, cookie); - return -ENOBUFS; -} - -#else /* FSCACHE_USE_NEW_IO_API */ - -/** - * fscache_read_or_alloc_page - Read a page from the cache or allocate a block - * in which to store it - * @cookie: The cookie representing the cache object - * @page: The netfs page to fill if possible - * @end_io_func: The callback to invoke when and if the page is filled - * @context: An arbitrary piece of data to pass on to end_io_func() - * @gfp: The conditions under which memory allocation should be made - * - * Read a page from the cache, or if that's not possible make a potential - * one-block reservation in the cache into which the page may be stored once - * fetched from the server. - * - * If the page is not backed by the cache object, or if it there's some reason - * it can't be, -ENOBUFS will be returned and nothing more will be done for - * that page. 
- * - * Else, if that page is backed by the cache, a read will be initiated directly - * to the netfs's page and 0 will be returned by this function. The - * end_io_func() callback will be invoked when the operation terminates on a - * completion or failure. Note that the callback may be invoked before the - * return. - * - * Else, if the page is unbacked, -ENODATA is returned and a block may have - * been allocated in the cache. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -int fscache_read_or_alloc_page(struct fscache_cookie *cookie, - struct page *page, - fscache_rw_complete_t end_io_func, - void *context, - gfp_t gfp) -{ - if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) - return __fscache_read_or_alloc_page(cookie, page, end_io_func, - context, gfp); - else - return -ENOBUFS; -} - -/** - * fscache_read_or_alloc_pages - Read pages from the cache and/or allocate - * blocks in which to store them - * @cookie: The cookie representing the cache object - * @mapping: The netfs inode mapping to which the pages will be attached - * @pages: A list of potential netfs pages to be filled - * @nr_pages: Number of pages to be read and/or allocated - * @end_io_func: The callback to invoke when and if each page is filled - * @context: An arbitrary piece of data to pass on to end_io_func() - * @gfp: The conditions under which memory allocation should be made - * - * Read a set of pages from the cache, or if that's not possible, attempt to - * make a potential one-block reservation for each page in the cache into which - * that page may be stored once fetched from the server. - * - * If some pages are not backed by the cache object, or if it there's some - * reason they can't be, -ENOBUFS will be returned and nothing more will be - * done for that pages. 
- * - * Else, if some of the pages are backed by the cache, a read will be initiated - * directly to the netfs's page and 0 will be returned by this function. The - * end_io_func() callback will be invoked when the operation terminates on a - * completion or failure. Note that the callback may be invoked before the - * return. - * - * Else, if a page is unbacked, -ENODATA is returned and a block may have - * been allocated in the cache. - * - * Because the function may want to return all of -ENOBUFS, -ENODATA and 0 in - * regard to different pages, the return values are prioritised in that order. - * Any pages submitted for reading are removed from the pages list. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages, - fscache_rw_complete_t end_io_func, - void *context, - gfp_t gfp) -{ - if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) - return __fscache_read_or_alloc_pages(cookie, mapping, pages, - nr_pages, end_io_func, - context, gfp); - else - return -ENOBUFS; -} - -/** - * fscache_alloc_page - Allocate a block in which to store a page - * @cookie: The cookie representing the cache object - * @page: The netfs page to allocate a page for - * @gfp: The conditions under which memory allocation should be made - * - * Request Allocation a block in the cache in which to store a netfs page - * without retrieving any contents from the cache. - * - * If the page is not backed by a file then -ENOBUFS will be returned and - * nothing more will be done, and no reservation will be made. - * - * Else, a block will be allocated if one wasn't already, and 0 will be - * returned - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. 
- */ -static inline -int fscache_alloc_page(struct fscache_cookie *cookie, - struct page *page, - gfp_t gfp) -{ - if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) - return __fscache_alloc_page(cookie, page, gfp); - else - return -ENOBUFS; -} - -/** - * fscache_readpages_cancel - Cancel read/alloc on pages - * @cookie: The cookie representing the inode's cache object. - * @pages: The netfs pages that we canceled write on in readpages() - * - * Uncache/unreserve the pages reserved earlier in readpages() via - * fscache_readpages_or_alloc() and similar. In most successful caches in - * readpages() this doesn't do anything. In cases when the underlying netfs's - * readahead failed we need to clean up the pagelist (unmark and uncache). - * - * This function may sleep as it may have to clean up disk state. - */ -static inline -void fscache_readpages_cancel(struct fscache_cookie *cookie, - struct list_head *pages) -{ - if (fscache_cookie_valid(cookie)) - __fscache_readpages_cancel(cookie, pages); -} - -/** - * fscache_write_page - Request storage of a page in the cache - * @cookie: The cookie representing the cache object - * @page: The netfs page to store - * @object_size: Updated size of object - * @gfp: The conditions under which memory allocation should be made - * - * Request the contents of the netfs page be written into the cache. This - * request may be ignored if no cache block is currently allocated, in which - * case it will return -ENOBUFS. - * - * If a cache block was already allocated, a write will be initiated and 0 will - * be returned. The PG_fscache_write page bit is set immediately and will then - * be cleared at the completion of the write to indicate the success or failure - * of the operation. Note that the completion may happen before the return. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. 
- */ -static inline -int fscache_write_page(struct fscache_cookie *cookie, - struct page *page, - loff_t object_size, - gfp_t gfp) -{ - if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) - return __fscache_write_page(cookie, page, object_size, gfp); - else - return -ENOBUFS; -} - -/** - * fscache_uncache_page - Indicate that caching is no longer required on a page - * @cookie: The cookie representing the cache object - * @page: The netfs page that was being cached. - * - * Tell the cache that we no longer want a page to be cached and that it should - * remove any knowledge of the netfs page it may have. - * - * Note that this cannot cancel any outstanding I/O operations between this - * page and the cache. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -void fscache_uncache_page(struct fscache_cookie *cookie, - struct page *page) -{ - if (fscache_cookie_valid(cookie)) - __fscache_uncache_page(cookie, page); -} - -/** - * fscache_check_page_write - Ask if a page is being writing to the cache - * @cookie: The cookie representing the cache object - * @page: The netfs page that is being cached. - * - * Ask the cache if a page is being written to the cache. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. - */ -static inline -bool fscache_check_page_write(struct fscache_cookie *cookie, - struct page *page) -{ - if (fscache_cookie_valid(cookie)) - return __fscache_check_page_write(cookie, page); - return false; -} - -/** - * fscache_wait_on_page_write - Wait for a page to complete writing to the cache - * @cookie: The cookie representing the cache object - * @page: The netfs page that is being cached. - * - * Ask the cache to wake us up when a page is no longer being written to the - * cache. - * - * See Documentation/filesystems/caching/netfs-api.rst for a complete - * description. 
- */ -static inline -void fscache_wait_on_page_write(struct fscache_cookie *cookie, - struct page *page) -{ - if (fscache_cookie_valid(cookie)) - __fscache_wait_on_page_write(cookie, page); -} - -/** - * fscache_maybe_release_page - Consider releasing a page, cancelling a store - * @cookie: The cookie representing the cache object - * @page: The netfs page that is being cached. - * @gfp: The gfp flags passed to releasepage() - * - * Consider releasing a page for the vmscan algorithm, on behalf of the netfs's - * releasepage() call. A storage request on the page may cancelled if it is - * not currently being processed. - * - * The function returns true if the page no longer has a storage request on it, - * and false if a storage request is left in place. If true is returned, the - * page will have been passed to fscache_uncache_page(). If false is returned - * the page cannot be freed yet. - */ -static inline -bool fscache_maybe_release_page(struct fscache_cookie *cookie, - struct page *page, - gfp_t gfp) -{ - if (fscache_cookie_valid(cookie) && PageFsCache(page)) - return __fscache_maybe_release_page(cookie, page, gfp); - return true; -} - -/** - * fscache_uncache_all_inode_pages - Uncache all an inode's pages - * @cookie: The cookie representing the inode's cache object. - * @inode: The inode to uncache pages from. - * - * Uncache all the pages in an inode that are marked PG_fscache, assuming them - * to be associated with the given cookie. - * - * This function may sleep. It will wait for pages that are being written out - * and will wait whilst the PG_fscache mark is removed by the cache. 
- */ -static inline -void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, - struct inode *inode) -{ - if (fscache_cookie_valid(cookie)) - __fscache_uncache_all_inode_pages(cookie, inode); -} - -#endif /* FSCACHE_USE_NEW_IO_API */ - -/** - * fscache_disable_cookie - Disable a cookie - * @cookie: The cookie representing the cache object - * @aux_data: The updated auxiliary data for the cookie (may be NULL) - * @invalidate: Invalidate the backing object - * - * Disable a cookie from accepting further alloc, read, write, invalidate, - * update or acquire operations. Outstanding operations can still be waited - * upon and pages can still be uncached and the cookie relinquished. - * - * This will not return until all outstanding operations have completed. - * - * If @invalidate is set, then the backing object will be invalidated and - * detached, otherwise it will just be detached. - * - * If @aux_data is set, then auxiliary data will be updated from that. - */ -static inline -void fscache_disable_cookie(struct fscache_cookie *cookie, - const void *aux_data, - bool invalidate) -{ - if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie)) - __fscache_disable_cookie(cookie, aux_data, invalidate); -} - -/** - * fscache_enable_cookie - Reenable a cookie - * @cookie: The cookie representing the cache object - * @aux_data: The updated auxiliary data for the cookie (may be NULL) - * @object_size: Current size of object - * @can_enable: A function to permit enablement once lock is held - * @data: Data for can_enable() - * - * Reenable a previously disabled cookie, allowing it to accept further alloc, - * read, write, invalidate, update or acquire operations. An attempt will be - * made to immediately reattach the cookie to a backing object. If @aux_data - * is set, the auxiliary data attached to the cookie will be updated. 
- * - * The can_enable() function is called (if not NULL) once the enablement lock - * is held to rule on whether enablement is still permitted to go ahead. - */ -static inline -void fscache_enable_cookie(struct fscache_cookie *cookie, - const void *aux_data, - loff_t object_size, - bool (*can_enable)(void *data), - void *data) -{ - if (fscache_cookie_valid(cookie) && !fscache_cookie_enabled(cookie)) - __fscache_enable_cookie(cookie, aux_data, object_size, - can_enable, data); -} - #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3 From a39c41b853ee51f4dcd19f5556f860ae8e2f23d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 14:30:37 +0100 Subject: netfs: Pass a flag to ->prepare_write() to say if there's no alloc'd space Pass a flag to ->prepare_write() to indicate if there's definitely no space allocated in the cache yet (for instance if we've already checked as we were asked to do a read). Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819583123.215744.12783808230464471417.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906886835.143852.6689886781122679769.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967079100.1823006.12889542712309574359.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021489334.640689.3131206613015409076.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/netfs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ca0683b9e3d1..1ea22fc48818 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -232,7 +232,8 @@ struct netfs_cache_ops { * actually do. 
*/ int (*prepare_write)(struct netfs_cache_resources *cres, - loff_t *_start, size_t *_len, loff_t i_size); + loff_t *_start, size_t *_len, loff_t i_size, + bool no_space_allocated_yet); }; struct readahead_control; -- cgit v1.2.3 From 1e1236b841166f1d2daf36fdf6bb3e656bc5f5ca Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 14:34:41 +0100 Subject: fscache: Introduce new driver Introduce basic skeleton of the new, rewritten fscache driver. Changes ======= ver #3: - Use remove_proc_subtree(), not remove_proc_entry() to remove a populated dir. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819584034.215744.4290533472390439030.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906887770.143852.3577888294989185666.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967080039.1823006.5702921801104057922.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021491014.640689.4292699878317589512.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 2 ++ include/linux/fscache.h | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 47f21a53ac4b..d6910a913918 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -16,4 +16,6 @@ #include +extern struct workqueue_struct *fscache_wq; + #endif /* _LINUX_FSCACHE_CACHE_H */ diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 0364a4ca16f6..1cf90c252aac 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* General filesystem caching interface * - * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * NOTE!!! 
See: @@ -18,9 +18,13 @@ #include #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE) +#define __fscache_available (1) +#define fscache_available() (1) #define fscache_cookie_valid(cookie) (cookie) #define fscache_cookie_enabled(cookie) (cookie) #else +#define __fscache_available (0) +#define fscache_available() (0) #define fscache_cookie_valid(cookie) (0) #define fscache_cookie_enabled(cookie) (0) #endif -- cgit v1.2.3 From 9549332df4ed4e761a1d41c83f2c25d28bb22431 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 15:00:26 +0100 Subject: fscache: Implement cache registration Implement a register of caches and provide functions to manage it. Two functions are provided for the cache backend to use: (1) Acquire a cache cookie: struct fscache_cache *fscache_acquire_cache(const char *name) This gets the cache cookie for a cache of the specified name and moves it to the preparation state. If a nameless cache cookie exists, that will be given this name and used. (2) Relinquish a cache cookie: void fscache_relinquish_cache(struct fscache_cache *cache); This relinquishes a cache cookie, cleans it and makes it available if it's still referenced by a network filesystem. Note that network filesystems don't deal with cache cookies directly, but rather go straight to the volume registration. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819587157.215744.13523139317322503286.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906889665.143852.10378009165231294456.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967085081.1823006.2218944206363626210.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021494847.640689.10109692261640524343.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index d6910a913918..18cd5c9877bb 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -16,6 +16,40 @@ #include +enum fscache_cache_trace; +enum fscache_access_trace; + +enum fscache_cache_state { + FSCACHE_CACHE_IS_NOT_PRESENT, /* No cache is present for this name */ + FSCACHE_CACHE_IS_PREPARING, /* A cache is preparing to come live */ + FSCACHE_CACHE_IS_ACTIVE, /* Attached cache is active and can be used */ + FSCACHE_CACHE_GOT_IOERROR, /* Attached cache stopped on I/O error */ + FSCACHE_CACHE_IS_WITHDRAWN, /* Attached cache is being withdrawn */ +#define NR__FSCACHE_CACHE_STATE (FSCACHE_CACHE_IS_WITHDRAWN + 1) +}; + +/* + * Cache cookie. + */ +struct fscache_cache { + struct list_head cache_link; /* Link in cache list */ + void *cache_priv; /* Private cache data (or NULL) */ + refcount_t ref; + atomic_t n_volumes; /* Number of active volumes; */ + atomic_t n_accesses; /* Number of in-progress accesses on the cache */ + atomic_t object_count; /* no. 
of live objects in this cache */ + unsigned int debug_id; + enum fscache_cache_state state; + char *name; +}; + extern struct workqueue_struct *fscache_wq; +/* + * out-of-line cache backend functions + */ +extern struct rw_semaphore fscache_addremove_sem; +extern struct fscache_cache *fscache_acquire_cache(const char *name); +extern void fscache_relinquish_cache(struct fscache_cache *cache); + #endif /* _LINUX_FSCACHE_CACHE_H */ -- cgit v1.2.3 From 62ab63352350e881ae693a8236b35d7d0516c78b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 15:26:17 +0100 Subject: fscache: Implement volume registration Add functions to the fscache API to allow volumes to be acquired and relinquished by the network filesystem. A volume is an index of data storage cache objects. A volume is represented by a volume cookie in the API. A filesystem would typically create a volume for a superblock and then create per-inode cookies within it. To request a volume, the filesystem calls: struct fscache_volume * fscache_acquire_volume(const char *volume_key, const char *cache_name, const void *coherency_data, size_t coherency_len) The volume_key is a printable string used to match the volume in the cache. It should not contain any '/' characters. For AFS, for example, this would be "afs,<cellname>,<volume_id>", e.g. "afs,example.com,523001". The cache_name can be NULL, but if not it should be a string indicating the name of the cache to use if there's more than one available. The coherency data, if given, is an arbitrarily-sized blob that's attached to the volume and is compared when the volume is looked up. If it doesn't match, the old volume is judged to be out of date and it and everything within it is discarded. Acquiring a volume twice concurrently is disallowed, though the function will wait if an old volume cookie is being relinquished.
When a network filesystem has finished with a volume, it should return the volume cookie by calling: void fscache_relinquish_volume(struct fscache_volume *volume, const void *coherency_data, bool invalidate) If invalidate is true, the entire volume will be discarded; if false, the volume will be synced and the coherency data will be updated. Changes ======= ver #4: - Removed an extraneous param from kdoc on fscache_relinquish_volume()[3]. ver #3: - fscache_hash()'s size parameter is now in bytes. Use __le32 as the unit to round up to. - When comparing cookies, simply see if the attributes are the same rather than subtracting them to produce a strcmp-style return[2]. - Make the coherency data an arbitrary blob rather than a u64, but don't store it for the moment. ver #2: - Fix error check[1]. - Make a fscache_acquire_volume() return errors, including EBUSY if a conflicting volume cookie already exists. No error is printed now - that's left to the netfs. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/20211203095608.GC2480@kili/ [1] Link: https://lore.kernel.org/r/CAHk-=whtkzB446+hX0zdLsdcUJsJ=8_-0S1mE_R+YurThfUbLA@mail.gmail.com/ [2] Link: https://lore.kernel.org/r/20211220224646.30e8205c@canb.auug.org.au/ [3] Link: https://lore.kernel.org/r/163819588944.215744.1629085755564865996.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906890630.143852.13972180614535611154.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967086836.1823006.8191672796841981763.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021495816.640689.4403156093668590217.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache.h | 84 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 1cf90c252aac..131a741a6652 100644 --- 
a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -20,13 +20,97 @@ #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE) #define __fscache_available (1) #define fscache_available() (1) +#define fscache_volume_valid(volume) (volume) #define fscache_cookie_valid(cookie) (cookie) #define fscache_cookie_enabled(cookie) (cookie) #else #define __fscache_available (0) #define fscache_available() (0) +#define fscache_volume_valid(volume) (0) #define fscache_cookie_valid(cookie) (0) #define fscache_cookie_enabled(cookie) (0) #endif +/* + * Volume representation cookie. + */ +struct fscache_volume { + refcount_t ref; + atomic_t n_cookies; /* Number of data cookies in volume */ + atomic_t n_accesses; /* Number of cache accesses in progress */ + unsigned int debug_id; + unsigned int key_hash; /* Hash of key string */ + char *key; /* Volume ID, eg. "afs@example.com@1234" */ + struct list_head proc_link; /* Link in /proc/fs/fscache/volumes */ + struct hlist_bl_node hash_link; /* Link in hash table */ + struct work_struct work; + struct fscache_cache *cache; /* The cache in which this resides */ + void *cache_priv; /* Cache private data */ + spinlock_t lock; + unsigned long flags; +#define FSCACHE_VOLUME_RELINQUISHED 0 /* Volume is being cleaned up */ +#define FSCACHE_VOLUME_INVALIDATE 1 /* Volume was invalidated */ +#define FSCACHE_VOLUME_COLLIDED_WITH 2 /* Volume was collided with */ +#define FSCACHE_VOLUME_ACQUIRE_PENDING 3 /* Volume is waiting to complete acquisition */ +#define FSCACHE_VOLUME_CREATING 4 /* Volume is being created on disk */ +}; + +/* + * slow-path functions for when there is actually caching available, and the + * netfs does actually have a valid token + * - these are not to be called directly + * - these are undefined symbols when FS-Cache is not configured and the + * optimiser takes care of not using them + */ +extern struct fscache_volume *__fscache_acquire_volume(const char *, const char *, + const void *, size_t); +extern void 
__fscache_relinquish_volume(struct fscache_volume *, const void *, bool); + +/** + * fscache_acquire_volume - Register a volume as desiring caching services + * @volume_key: An identification string for the volume + * @cache_name: The name of the cache to use (or NULL for the default) + * @coherency_data: Piece of arbitrary coherency data to check (or NULL) + * @coherency_len: The size of the coherency data + * + * Register a volume as desiring caching services if they're available. The + * caller must provide an identifier for the volume and may also indicate which + * cache it should be in. If a preexisting volume entry is found in the cache, + * the coherency data must match otherwise the entry will be invalidated. + * + * Returns a cookie pointer on success, -ENOMEM if out of memory or -EBUSY if a + * cache volume of that name is already acquired. Note that "NULL" is a valid + * cookie pointer and can be returned if caching is refused. + */ +static inline +struct fscache_volume *fscache_acquire_volume(const char *volume_key, + const char *cache_name, + const void *coherency_data, + size_t coherency_len) +{ + if (!fscache_available()) + return NULL; + return __fscache_acquire_volume(volume_key, cache_name, + coherency_data, coherency_len); +} + +/** + * fscache_relinquish_volume - Cease caching a volume + * @volume: The volume cookie + * @coherency_data: Piece of arbitrary coherency data to set (or NULL) + * @invalidate: True if the volume should be invalidated + * + * Indicate that a filesystem no longer desires caching services for a volume. + * The caller must have relinquished all file cookies prior to calling this. + * The stored coherency data is updated. 
+ */ +static inline +void fscache_relinquish_volume(struct fscache_volume *volume, + const void *coherency_data, + bool invalidate) +{ + if (fscache_volume_valid(volume)) + __fscache_relinquish_volume(volume, coherency_data, invalidate); +} + #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3 From 7f3283aba39a0f395700c3b5defa4ec49d9914b3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 15:53:34 +0100 Subject: fscache: Implement cookie registration Add functions to the fscache API to allow data file cookies to be acquired and relinquished by the network filesystem. It is intended that the filesystem will create such cookies per-inode under a volume. To request a cookie, the filesystem should call: struct fscache_cookie * fscache_acquire_cookie(struct fscache_volume *volume, u8 advice, const void *index_key, size_t index_key_len, const void *aux_data, size_t aux_data_len, loff_t object_size) The filesystem must first have created a volume cookie, which is passed in here. If it passes in NULL then the function will just return a NULL cookie. A binary key should be passed in index_key and is of size index_key_len. This is saved in the cookie and is used to locate the associated data in the cache. A coherency data buffer of size aux_data_len will be allocated and initialised from the buffer pointed to by aux_data. This is used to validate cache objects when they're opened and is stored on disk with them when they're committed. The data is stored in the cookie and will be updateable by various functions in later patches. The object_size must also be given. This is also used to perform a coherency check and to size the backing storage appropriately. This function disallows a cookie from being acquired twice in parallel, though it will cause the second user to wait if the first is busy relinquishing its cookie. 
When a network filesystem has finished with a cookie, it should call: void fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) If retire is true, any backing data will be discarded immediately. Changes ======= ver #3: - fscache_hash()'s size parameter is now in bytes. Use __le32 as the unit to round up to. - When comparing cookies, simply see if the attributes are the same rather than subtracting them to produce a strcmp-style return[1]. - Add a check to see if the cookie is still hashed at the point of freeing. ver #2: - Don't hold n_accesses elevated whilst cache is bound to a cookie, but rather add a flag that prevents the state machine from being queued when n_accesses reaches 0. - Remove the unused cookie pointer field from the fscache_acquire tracepoint. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/CAHk-=whtkzB446+hX0zdLsdcUJsJ=8_-0S1mE_R+YurThfUbLA@mail.gmail.com/ [1] Link: https://lore.kernel.org/r/163819590658.215744.14934902514281054323.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906891983.143852.6219772337558577395.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967088507.1823006.12659006350221417165.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021498432.640689.12743483856927722772.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 22 +++++++ include/linux/fscache.h | 134 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 18cd5c9877bb..c4355b888c91 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -17,6 +17,7 @@ #include enum fscache_cache_trace; +enum fscache_cookie_trace; enum fscache_access_trace; enum fscache_cache_state { @@ -52,4 +53,25 @@ extern struct rw_semaphore fscache_addremove_sem; extern struct
fscache_cache *fscache_acquire_cache(const char *name); extern void fscache_relinquish_cache(struct fscache_cache *cache); +extern struct fscache_cookie *fscache_get_cookie(struct fscache_cookie *cookie, + enum fscache_cookie_trace where); +extern void fscache_put_cookie(struct fscache_cookie *cookie, + enum fscache_cookie_trace where); +extern void fscache_set_cookie_state(struct fscache_cookie *cookie, + enum fscache_cookie_state state); + +/** + * fscache_get_key - Get a pointer to the cookie key + * @cookie: The cookie to query + * + * Return a pointer to the where a cookie's key is stored. + */ +static inline void *fscache_get_key(struct fscache_cookie *cookie) +{ + if (cookie->key_len <= sizeof(cookie->inline_key)) + return cookie->inline_key; + else + return cookie->key; +} + #endif /* _LINUX_FSCACHE_CACHE_H */ diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 131a741a6652..4450d17c11e8 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -31,6 +31,27 @@ #define fscache_cookie_enabled(cookie) (0) #endif +struct fscache_cookie; + +#define FSCACHE_ADV_SINGLE_CHUNK 0x01 /* The object is a single chunk of data */ +#define FSCACHE_ADV_WRITE_CACHE 0x00 /* Do cache if written to locally */ +#define FSCACHE_ADV_WRITE_NOCACHE 0x02 /* Don't cache if written to locally */ + +/* + * Data object state. 
+ */ +enum fscache_cookie_state { + FSCACHE_COOKIE_STATE_QUIESCENT, /* The cookie is uncached */ + FSCACHE_COOKIE_STATE_LOOKING_UP, /* The cache object is being looked up */ + FSCACHE_COOKIE_STATE_CREATING, /* The cache object is being created */ + FSCACHE_COOKIE_STATE_ACTIVE, /* The cache is active, readable and writable */ + FSCACHE_COOKIE_STATE_FAILED, /* The cache failed, withdraw to clear */ + FSCACHE_COOKIE_STATE_WITHDRAWING, /* The cookie is being withdrawn */ + FSCACHE_COOKIE_STATE_RELINQUISHING, /* The cookie is being relinquished */ + FSCACHE_COOKIE_STATE_DROPPED, /* The cookie has been dropped */ +#define FSCACHE_COOKIE_STATE__NR (FSCACHE_COOKIE_STATE_DROPPED + 1) +} __attribute__((mode(byte))); + /* * Volume representation cookie. */ @@ -55,6 +76,60 @@ struct fscache_volume { #define FSCACHE_VOLUME_CREATING 4 /* Volume is being created on disk */ }; +/* + * Data file representation cookie. + * - a file will only appear in one cache + * - a request to cache a file may or may not be honoured, subject to + * constraints such as disk space + * - indices are created on disk just-in-time + */ +struct fscache_cookie { + refcount_t ref; + atomic_t n_active; /* number of active users of cookie */ + atomic_t n_accesses; /* Number of cache accesses in progress */ + unsigned int debug_id; + unsigned int inval_counter; /* Number of invalidations made */ + spinlock_t lock; + struct fscache_volume *volume; /* Parent volume of this file. 
*/ + void *cache_priv; /* Cache-side representation */ + struct hlist_bl_node hash_link; /* Link in hash table */ + struct list_head proc_link; /* Link in proc list */ + struct list_head commit_link; /* Link in commit queue */ + struct work_struct work; /* Commit/relinq/withdraw work */ + loff_t object_size; /* Size of the netfs object */ + unsigned long unused_at; /* Time at which unused (jiffies) */ + unsigned long flags; +#define FSCACHE_COOKIE_RELINQUISHED 0 /* T if cookie has been relinquished */ +#define FSCACHE_COOKIE_RETIRED 1 /* T if this cookie has retired on relinq */ +#define FSCACHE_COOKIE_IS_CACHING 2 /* T if this cookie is cached */ +#define FSCACHE_COOKIE_NO_DATA_TO_READ 3 /* T if this cookie has nothing to read */ +#define FSCACHE_COOKIE_NEEDS_UPDATE 4 /* T if attrs have been updated */ +#define FSCACHE_COOKIE_HAS_BEEN_CACHED 5 /* T if cookie needs withdraw-on-relinq */ +#define FSCACHE_COOKIE_DISABLED 6 /* T if cookie has been disabled */ +#define FSCACHE_COOKIE_LOCAL_WRITE 7 /* T if cookie has been modified locally */ +#define FSCACHE_COOKIE_NO_ACCESS_WAKE 8 /* T if no wake when n_accesses goes 0 */ +#define FSCACHE_COOKIE_DO_RELINQUISH 9 /* T if this cookie needs relinquishment */ +#define FSCACHE_COOKIE_DO_WITHDRAW 10 /* T if this cookie needs withdrawing */ +#define FSCACHE_COOKIE_DO_LRU_DISCARD 11 /* T if this cookie needs LRU discard */ +#define FSCACHE_COOKIE_DO_PREP_TO_WRITE 12 /* T if cookie needs write preparation */ +#define FSCACHE_COOKIE_HAVE_DATA 13 /* T if this cookie has data stored */ +#define FSCACHE_COOKIE_IS_HASHED 14 /* T if this cookie is hashed */ + + enum fscache_cookie_state state; + u8 advice; /* FSCACHE_ADV_* */ + u8 key_len; /* Length of index key */ + u8 aux_len; /* Length of auxiliary data */ + u32 key_hash; /* Hash of volume, key, len */ + union { + void *key; /* Index key */ + u8 inline_key[16]; /* - If the key is short enough */ + }; + union { + void *aux; /* Auxiliary data */ + u8 inline_aux[8]; /* - If the aux 
data is short enough */ + }; +}; + /* * slow-path functions for when there is actually caching available, and the * netfs does actually have a valid token @@ -66,6 +141,14 @@ extern struct fscache_volume *__fscache_acquire_volume(const char *, const char const void *, size_t); extern void __fscache_relinquish_volume(struct fscache_volume *, const void *, bool); +extern struct fscache_cookie *__fscache_acquire_cookie( + struct fscache_volume *, + u8, + const void *, size_t, + const void *, size_t, + loff_t); +extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool); + /** * fscache_acquire_volume - Register a volume as desiring caching services * @volume_key: An identification string for the volume @@ -113,4 +196,55 @@ void fscache_relinquish_volume(struct fscache_volume *volume, __fscache_relinquish_volume(volume, coherency_data, invalidate); } +/** + * fscache_acquire_cookie - Acquire a cookie to represent a cache object + * @volume: The volume in which to locate/create this cookie + * @advice: Advice flags (FSCACHE_COOKIE_ADV_*) + * @index_key: The index key for this cookie + * @index_key_len: Size of the index key + * @aux_data: The auxiliary data for the cookie (may be NULL) + * @aux_data_len: Size of the auxiliary data buffer + * @object_size: The initial size of object + * + * Acquire a cookie to represent a data file within the given cache volume. + * + * See Documentation/filesystems/caching/netfs-api.rst for a complete + * description. 
+ */ +static inline +struct fscache_cookie *fscache_acquire_cookie(struct fscache_volume *volume, + u8 advice, + const void *index_key, + size_t index_key_len, + const void *aux_data, + size_t aux_data_len, + loff_t object_size) +{ + if (!fscache_volume_valid(volume)) + return NULL; + return __fscache_acquire_cookie(volume, advice, + index_key, index_key_len, + aux_data, aux_data_len, + object_size); +} + +/** + * fscache_relinquish_cookie - Return the cookie to the cache, maybe discarding + * it + * @cookie: The cookie being returned + * @retire: True if the cache object the cookie represents is to be discarded + * + * This function returns a cookie to the cache, forcibly discarding the + * associated cache object if retire is set to true. + * + * See Documentation/filesystems/caching/netfs-api.rst for a complete + * description. + */ +static inline +void fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) +{ + if (fscache_cookie_valid(cookie)) + __fscache_relinquish_cookie(cookie, retire); +} + #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3 From e6acd3299badbfb5fb0231d42481d4f5dedf5599 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 15:26:17 +0100 Subject: fscache: Implement volume-level access helpers Add a pair of helper functions to manage access to a volume, pinning the volume in place for the duration to prevent cache withdrawal from removing it: bool fscache_begin_volume_access(struct fscache_volume *volume, enum fscache_access_trace why); void fscache_end_volume_access(struct fscache_volume *volume, enum fscache_access_trace why); The way the access gate on the volume works/will work is: (1) If the cache tests as not live (state is not FSCACHE_CACHE_IS_ACTIVE), then we return false to indicate access was not permitted. (2) If the cache tests as live, then we increment the volume's n_accesses count and then recheck the cache liveness, ending the access if it ceased to be live. 
(3) When we end the access, we decrement the volume's n_accesses and wake up any waiters if it reaches 0. (4) Whilst the cache is caching, the volume's n_accesses is kept artificially incremented to prevent wakeups from happening. (5) When the cache is taken offline, the state is changed to prevent new accesses, the volume's n_accesses is decremented and we wait for it to become 0. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819594158.215744.8285859817391683254.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906894315.143852.5454793807544710479.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967095028.1823006.9173132503876627466.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021501546.640689.9631510472149608443.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index c4355b888c91..fbbd8a2afe12 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -53,6 +53,10 @@ extern struct rw_semaphore fscache_addremove_sem; extern struct fscache_cache *fscache_acquire_cache(const char *name); extern void fscache_relinquish_cache(struct fscache_cache *cache); +extern void fscache_end_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why); + extern struct fscache_cookie *fscache_get_cookie(struct fscache_cookie *cookie, enum fscache_cookie_trace where); extern void fscache_put_cookie(struct fscache_cookie *cookie, -- cgit v1.2.3 From a7733fb632722a2f085f9324f14783effe268ed3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 15:53:34 +0100 Subject: fscache: Implement cookie-level access helpers Add a number of helper functions to manage access to a cookie, pinning the cache
object in place for the duration to prevent cache withdrawal from removing it: (1) void fscache_init_access_gate(struct fscache_cookie *cookie); This function initialises the access count when a cache binds to a cookie. An extra ref is taken on the access count to prevent wakeups while the cache is active. We're only interested in the wakeup when a cookie is being withdrawn and we're waiting for it to quiesce - at which point the counter will be decremented before the wait. The FSCACHE_COOKIE_NACC_ELEVATED flag is set on the cookie to keep track of the extra ref in order to handle a race between relinquishment and withdrawal both trying to drop the extra ref. (2) bool fscache_begin_cookie_access(struct fscache_cookie *cookie, enum fscache_access_trace why); This function attempts to begin access upon a cookie, pinning it in place if it's cached. If successful, it returns true and leaves the access count incremented. (3) void fscache_end_cookie_access(struct fscache_cookie *cookie, enum fscache_access_trace why); This function drops the access count obtained by (2), permitting object withdrawal to take place when it reaches zero. A tracepoint is provided to track changes to the access counter on a cookie. Changes ======= ver #2: - Don't hold n_accesses elevated whilst cache is bound to a cookie, but rather add a flag that prevents the state machine from being queued when n_accesses reaches 0.
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819595085.215744.1706073049250505427.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906895313.143852.10141619544149102193.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967095980.1823006.1133648159424418877.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021503063.640689.8870918985269528670.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index fbbd8a2afe12..66624407ba84 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -61,6 +61,8 @@ extern struct fscache_cookie *fscache_get_cookie(struct fscache_cookie *cookie, enum fscache_cookie_trace where); extern void fscache_put_cookie(struct fscache_cookie *cookie, enum fscache_cookie_trace where); +extern void fscache_end_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why); extern void fscache_set_cookie_state(struct fscache_cookie *cookie, enum fscache_cookie_state state); -- cgit v1.2.3 From 2e0c76aee25f33c482abda6224bd87732359354d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 15:00:26 +0100 Subject: fscache: Implement functions add/remove a cache Implement functions to allow the cache backend to add or remove a cache: (1) Declare a cache to be live: int fscache_add_cache(struct fscache_cache *cache, const struct fscache_cache_ops *ops, void *cache_priv); Take a previously acquired cache cookie, set the operations table and private data and mark the cache open for access. (2) Withdraw a cache from service: void fscache_withdraw_cache(struct fscache_cache *cache); This marks the cache as withdrawn and thus prevents further cache-level and volume-level accesses. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819596022.215744.8799712491432238827.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906896599.143852.17049208999019262884.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967097870.1823006.3470041000971522030.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021505541.640689.1819714759326331054.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 66624407ba84..f78add6e7823 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -33,6 +33,7 @@ enum fscache_cache_state { * Cache cookie. */ struct fscache_cache { + const struct fscache_cache_ops *ops; struct list_head cache_link; /* Link in cache list */ void *cache_priv; /* Private cache data (or NULL) */ refcount_t ref; @@ -44,6 +45,14 @@ struct fscache_cache { char *name; }; +/* + * cache operations + */ +struct fscache_cache_ops { + /* name of cache provider */ + const char *name; +}; + extern struct workqueue_struct *fscache_wq; /* @@ -52,6 +61,10 @@ extern struct workqueue_struct *fscache_wq; extern struct rw_semaphore fscache_addremove_sem; extern struct fscache_cache *fscache_acquire_cache(const char *name); extern void fscache_relinquish_cache(struct fscache_cache *cache); +extern int fscache_add_cache(struct fscache_cache *cache, + const struct fscache_cache_ops *ops, + void *cache_priv); +extern void fscache_withdraw_cache(struct fscache_cache *cache); extern void fscache_end_volume_access(struct fscache_volume *volume, struct fscache_cookie *cookie, -- cgit v1.2.3 From bfa22da3ed652aa15acd4246fa13a0de6dbe4a59 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 15:26:17 +0100 Subject: fscache: Provide and use 
cache methods to lookup/create/free a volume Add cache methods to lookup, create and remove a volume. Looking up or creating the volume requires the cache pinning for access; freeing the volume requires the volume pinning for access. The ->acquire_volume() method is used to ask the cache backend to lookup and, if necessary, create a volume; the ->free_volume() method is used to free the resources for a volume. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819597821.215744.5225318658134989949.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906898645.143852.8537799955945956818.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967099771.1823006.1455197910571061835.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021507345.640689.4073511598838843040.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index f78add6e7823..a10b66ca3544 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -51,6 +51,12 @@ struct fscache_cache { struct fscache_cache_ops { /* name of cache provider */ const char *name; + + /* Acquire a volume */ + void (*acquire_volume)(struct fscache_volume *volume); + + /* Free the cache's data attached to a volume */ + void (*free_volume)(struct fscache_volume *volume); }; extern struct workqueue_struct *fscache_wq; @@ -65,6 +71,7 @@ extern int fscache_add_cache(struct fscache_cache *cache, const struct fscache_cache_ops *ops, void *cache_priv); extern void fscache_withdraw_cache(struct fscache_cache *cache); +extern void fscache_withdraw_volume(struct fscache_volume *volume); extern void fscache_end_volume_access(struct fscache_volume *volume, struct fscache_cookie *cookie, -- cgit v1.2.3 From 
29f18e79fe7c5f8011befeda9be6b220a350f947 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 15:00:26 +0100 Subject: fscache: Add a function for a cache backend to note an I/O error Add a function to the backend API to note an I/O error in a cache. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819598741.215744.891281275151382095.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906901316.143852.15225412215771586528.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967100721.1823006.16435671567428949398.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021508840.640689.11902836226570620424.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index a10b66ca3544..936ef731bbc7 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -73,6 +73,8 @@ extern int fscache_add_cache(struct fscache_cache *cache, extern void fscache_withdraw_cache(struct fscache_cache *cache); extern void fscache_withdraw_volume(struct fscache_volume *volume); +extern void fscache_io_error(struct fscache_cache *cache); + extern void fscache_end_volume_access(struct fscache_volume *volume, struct fscache_cookie *cookie, enum fscache_access_trace why); -- cgit v1.2.3 From 5d00e426f95e7ea036fec2a0aceb3f71d6dbdf92 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 15:53:34 +0100 Subject: fscache: Implement simple cookie state machine Implement a very simple cookie state machine to handle lookup, invalidation, withdrawal, relinquishment and, to be added later, commit on LRU discard. 
Three cache methods are provided: ->lookup_cookie() to look up and, if necessary, create a data storage object; ->withdraw_cookie() to free the resources associated with that object and potentially delete it; and ->prepare_to_write() to prepare for the cached data to be modified locally. Changes ======= ver #3: - Fix a race between LRU discard and relinquishment whereby the former would override the latter and thus the latter would never happen[1]. ver #2: - Don't hold n_accesses elevated whilst cache is bound to a cookie, but rather add a flag that prevents the state machine from being queued when n_accesses reaches 0. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/599331.1639410068@warthog.procyon.org.uk/ [1] Link: https://lore.kernel.org/r/163819599657.215744.15799615296912341745.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906903925.143852.1805855338154353867.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967105456.1823006.14730395299835841776.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021510706.640689.7961423370243272583.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 936ef731bbc7..ae6a75976450 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -57,6 +57,15 @@ struct fscache_cache_ops { /* Free the cache's data attached to a volume */ void (*free_volume)(struct fscache_volume *volume); + + /* Look up a cookie in the cache */ + bool (*lookup_cookie)(struct fscache_cookie *cookie); + + /* Withdraw an object without any cookie access counts held */ + void (*withdraw_cookie)(struct fscache_cookie *cookie); + + /* Prepare to write to a live cache object */ + void
(*prepare_to_write)(struct fscache_cookie *cookie); }; extern struct workqueue_struct *fscache_wq; @@ -72,6 +81,7 @@ extern int fscache_add_cache(struct fscache_cache *cache, void *cache_priv); extern void fscache_withdraw_cache(struct fscache_cache *cache); extern void fscache_withdraw_volume(struct fscache_volume *volume); +extern void fscache_withdraw_cookie(struct fscache_cookie *cookie); extern void fscache_io_error(struct fscache_cache *cache); @@ -85,8 +95,21 @@ extern void fscache_put_cookie(struct fscache_cookie *cookie, enum fscache_cookie_trace where); extern void fscache_end_cookie_access(struct fscache_cookie *cookie, enum fscache_access_trace why); -extern void fscache_set_cookie_state(struct fscache_cookie *cookie, - enum fscache_cookie_state state); +extern void fscache_cookie_lookup_negative(struct fscache_cookie *cookie); +extern void fscache_caching_failed(struct fscache_cookie *cookie); + +/** + * fscache_cookie_state - Read the state of a cookie + * @cookie: The cookie to query + * + * Get the state of a cookie, imposing an ordering between the cookie contents + * and the state value. Paired with fscache_set_cookie_state(). + */ +static inline +enum fscache_cookie_state fscache_cookie_state(struct fscache_cookie *cookie) +{ + return smp_load_acquire(&cookie->state); +} /** * fscache_get_key - Get a pointer to the cookie key -- cgit v1.2.3 From 12bb21a29c19aae50cfad4e2bb5c943108f34a7d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 15:53:34 +0100 Subject: fscache: Implement cookie user counting and resource pinning Provide a pair of functions to count the number of users of a cookie (open files, writeback, invalidation, resizing, reads, writes), to obtain and pin resources for the cookie and to prevent culling whilst there are users.
The first function marks a cookie as being in use: void fscache_use_cookie(struct fscache_cookie *cookie, bool will_modify); The caller should indicate the cookie to use and whether or not the caller is in a context that may modify the cookie (e.g. a file open O_RDWR). If the cookie is not already resourced, fscache will ask the cache backend in the background to do whatever it needs to look up, create or otherwise obtain the resources necessary to access data. This is pinned to the cookie and may not be culled, though it may be withdrawn if the cache as a whole is withdrawn. The second function removes the in-use mark from a cookie and, optionally, updates the coherency data: void fscache_unuse_cookie(struct fscache_cookie *cookie, const void *aux_data, const loff_t *object_size); If non-NULL, the aux_data buffer and/or the object_size will be saved into the cookie and will be set on the backing store when the object is committed. If this removes the last usage on a cookie, the cookie is placed onto an LRU list from which it will be removed and closed after a couple of seconds if it doesn't get reused. This prevents resource overload in the cache - in particular it prevents it from holding too many files open. Changes ======= ver #2: - Fix fscache_unuse_cookie() to use atomic_dec_and_lock() to avoid a potential race if the cookie gets reused before it completes the unusement. - Added missing transition to LRU_DISCARDING state. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819600612.215744.13678350304176542741.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906907567.143852.16979631199380722019.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967106467.1823006.6790864931048582667.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021511674.640689.10084988363699111860.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 4450d17c11e8..e6c321e5bf73 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -22,12 +22,14 @@ #define fscache_available() (1) #define fscache_volume_valid(volume) (volume) #define fscache_cookie_valid(cookie) (cookie) -#define fscache_cookie_enabled(cookie) (cookie) +#define fscache_resources_valid(cres) ((cres)->cache_priv) +#define fscache_cookie_enabled(cookie) (cookie && !test_bit(FSCACHE_COOKIE_DISABLED, &cookie->flags)) #else #define __fscache_available (0) #define fscache_available() (0) #define fscache_volume_valid(volume) (0) #define fscache_cookie_valid(cookie) (0) +#define fscache_resources_valid(cres) (false) #define fscache_cookie_enabled(cookie) (0) #endif @@ -46,6 +48,7 @@ enum fscache_cookie_state { FSCACHE_COOKIE_STATE_CREATING, /* The cache object is being created */ FSCACHE_COOKIE_STATE_ACTIVE, /* The cache is active, readable and writable */ FSCACHE_COOKIE_STATE_FAILED, /* The cache failed, withdraw to clear */ + FSCACHE_COOKIE_STATE_LRU_DISCARDING, /* The cookie is being discarded by the LRU */ FSCACHE_COOKIE_STATE_WITHDRAWING, /* The cookie is being withdrawn */ FSCACHE_COOKIE_STATE_RELINQUISHING, /* The cookie is being relinquished */ FSCACHE_COOKIE_STATE_DROPPED, /* The cookie has been 
dropped */ @@ -147,6 +150,8 @@ extern struct fscache_cookie *__fscache_acquire_cookie( const void *, size_t, const void *, size_t, loff_t); +extern void __fscache_use_cookie(struct fscache_cookie *, bool); +extern void __fscache_unuse_cookie(struct fscache_cookie *, const void *, const loff_t *); extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool); /** @@ -228,6 +233,39 @@ struct fscache_cookie *fscache_acquire_cookie(struct fscache_volume *volume, object_size); } +/** + * fscache_use_cookie - Request usage of cookie attached to an object + * @object: Object description + * @will_modify: If cache is expected to be modified locally + * + * Request usage of the cookie attached to an object. The caller should tell + * the cache if the object's contents are about to be modified locally and then + * the cache can apply the policy that has been set to handle this case. + */ +static inline void fscache_use_cookie(struct fscache_cookie *cookie, + bool will_modify) +{ + if (fscache_cookie_valid(cookie)) + __fscache_use_cookie(cookie, will_modify); +} + +/** + * fscache_unuse_cookie - Cease usage of cookie attached to an object + * @object: Object description + * @aux_data: Updated auxiliary data (or NULL) + * @object_size: Revised size of the object (or NULL) + * + * Cease usage of the cookie attached to an object. When the users count + * reaches zero then the cookie relinquishment will be permitted to proceed. + */ +static inline void fscache_unuse_cookie(struct fscache_cookie *cookie, + const void *aux_data, + const loff_t *object_size) +{ + if (fscache_cookie_valid(cookie)) + __fscache_unuse_cookie(cookie, aux_data, object_size); +} + /** * fscache_relinquish_cookie - Return the cookie to the cache, maybe discarding * it @@ -247,4 +285,46 @@ void fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) __fscache_relinquish_cookie(cookie, retire); } +/* + * Find the auxiliary data on a cookie. 
+ */ +static inline void *fscache_get_aux(struct fscache_cookie *cookie) +{ + if (cookie->aux_len <= sizeof(cookie->inline_aux)) + return cookie->inline_aux; + else + return cookie->aux; +} + +/* + * Update the auxiliary data on a cookie. + */ +static inline +void fscache_update_aux(struct fscache_cookie *cookie, + const void *aux_data, const loff_t *object_size) +{ + void *p = fscache_get_aux(cookie); + + if (aux_data && p) + memcpy(p, aux_data, cookie->aux_len); + if (object_size) + cookie->object_size = *object_size; +} + +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_updates; +#endif + +static inline +void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data, + const loff_t *object_size) +{ +#ifdef CONFIG_FSCACHE_STATS + atomic_inc(&fscache_n_updates); +#endif + fscache_update_aux(cookie, aux_data, object_size); + smp_wmb(); + set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags); +} + #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3 From d24af13e2e2358a602740c7817ea90da43d3e740 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 15:53:34 +0100 Subject: fscache: Implement cookie invalidation Add a function to invalidate the cache behind a cookie: void fscache_invalidate(struct fscache_cookie *cookie, const void *aux_data, loff_t size, unsigned int flags) This causes any cached data for the specified cookie to be discarded. If the cookie is marked as being in use, a new cache object will be created if possible and future I/O will use that instead. In-flight I/O should be abandoned (writes) or reconsidered (reads). Each time it is called cookie->inval_counter is incremented and this can be used to detect invalidation at the end of an I/O operation. The coherency data attached to the cookie can be updated and the cookie size should be reset. One flag is available, FSCACHE_INVAL_DIO_WRITE, which should be used to indicate invalidation due to a DIO write on a file. This will temporarily disable caching for this cookie. 
Changes ======= ver #2: - Should only change to inval state if can get access to cache. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819602231.215744.11206598147269491575.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906909707.143852.18056070560477964891.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967107447.1823006.5945029409592119962.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021512640.640689.11418616313147754172.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 4 ++++ include/linux/fscache.h | 31 +++++++++++++++++++++++++++++++ include/linux/netfs.h | 1 + 3 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index ae6a75976450..1ad56bfd9d72 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -64,6 +64,9 @@ struct fscache_cache_ops { /* Withdraw an object without any cookie access counts held */ void (*withdraw_cookie)(struct fscache_cookie *cookie); + /* Invalidate an object */ + bool (*invalidate_cookie)(struct fscache_cookie *cookie); + /* Prepare to write to a live cache object */ void (*prepare_to_write)(struct fscache_cookie *cookie); }; @@ -96,6 +99,7 @@ extern void fscache_put_cookie(struct fscache_cookie *cookie, extern void fscache_end_cookie_access(struct fscache_cookie *cookie, enum fscache_access_trace why); extern void fscache_cookie_lookup_negative(struct fscache_cookie *cookie); +extern void fscache_resume_after_invalidation(struct fscache_cookie *cookie); extern void fscache_caching_failed(struct fscache_cookie *cookie); /** diff --git a/include/linux/fscache.h b/include/linux/fscache.h index e6c321e5bf73..0f36d1fac237 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -39,6 +39,8 @@ struct fscache_cookie; #define FSCACHE_ADV_WRITE_CACHE 0x00 /* Do 
cache if written to locally */ #define FSCACHE_ADV_WRITE_NOCACHE 0x02 /* Don't cache if written to locally */ +#define FSCACHE_INVAL_DIO_WRITE 0x01 /* Invalidate due to DIO write */ + /* * Data object state. */ @@ -47,6 +49,7 @@ enum fscache_cookie_state { FSCACHE_COOKIE_STATE_LOOKING_UP, /* The cache object is being looked up */ FSCACHE_COOKIE_STATE_CREATING, /* The cache object is being created */ FSCACHE_COOKIE_STATE_ACTIVE, /* The cache is active, readable and writable */ + FSCACHE_COOKIE_STATE_INVALIDATING, /* The cache is being invalidated */ FSCACHE_COOKIE_STATE_FAILED, /* The cache failed, withdraw to clear */ FSCACHE_COOKIE_STATE_LRU_DISCARDING, /* The cookie is being discarded by the LRU */ FSCACHE_COOKIE_STATE_WITHDRAWING, /* The cookie is being withdrawn */ @@ -153,6 +156,7 @@ extern struct fscache_cookie *__fscache_acquire_cookie( extern void __fscache_use_cookie(struct fscache_cookie *, bool); extern void __fscache_unuse_cookie(struct fscache_cookie *, const void *, const loff_t *); extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool); +extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t, unsigned int); /** * fscache_acquire_volume - Register a volume as desiring caching services @@ -327,4 +331,31 @@ void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags); } +/** + * fscache_invalidate - Notify cache that an object needs invalidation + * @cookie: The cookie representing the cache object + * @aux_data: The updated auxiliary data for the cookie (may be NULL) + * @size: The revised size of the object. + * @flags: Invalidation flags (FSCACHE_INVAL_*) + * + * Notify the cache that an object is needs to be invalidated and that it + * should abort any retrievals or stores it is doing on the cache. This + * increments inval_counter on the cookie which can be used by the caller to + * reconsider I/O requests as they complete. 
+ * + * If @flags has FSCACHE_INVAL_DIO_WRITE set, this indicates that this is due + * to a direct I/O write and will cause caching to be disabled on this cookie + * until it is completely unused. + * + * See Documentation/filesystems/caching/netfs-api.rst for a complete + * description. + */ +static inline +void fscache_invalidate(struct fscache_cookie *cookie, + const void *aux_data, loff_t size, unsigned int flags) +{ + if (fscache_cookie_enabled(cookie)) + __fscache_invalidate(cookie, aux_data, size, flags); +} + #endif /* _LINUX_FSCACHE_H */ diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 1ea22fc48818..5a46fde65759 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -124,6 +124,7 @@ struct netfs_cache_resources { void *cache_priv; void *cache_priv2; unsigned int debug_id; /* Cookie debug ID */ + unsigned int inval_counter; /* object->inval_counter at begin_op */ }; /* -- cgit v1.2.3 From d64f4554dd177c5891c02424a8d9e80590b55b35 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 14:06:34 +0100 Subject: fscache: Provide a means to begin an operation Provide a function to begin a read operation: int fscache_begin_read_operation( struct netfs_cache_resources *cres, struct fscache_cookie *cookie) This is primarily intended to be called by network filesystems on behalf of netfslib, but may also be called to use the I/O access functions directly. It attaches the resources required by the cache to cres struct from the supplied cookie. This holds access to the cache behind the cookie for the duration of the operation and forces cache withdrawal and cookie invalidation to perform synchronisation on the operation. cres->inval_counter is set from the cookie at this point so that it can be compared at the end of the operation. Note that this does not guarantee that the cache state is fully set up and able to perform I/O immediately; looking up and creation may be left in progress in the background. 
The operations intended to be called by the network filesystem, such as reading and writing, are expected to wait for the cookie to move to the correct state. This will, however, potentially sleep, waiting for a certain minimum state to be set or for operations such as invalidate to advance far enough that I/O can resume. Also provide a function for the cache to call to wait for the cache object to get to a state where it can be used for certain things: bool fscache_wait_for_operation(struct netfs_cache_resources *cres, enum fscache_want_state state); This looks at the cache resources provided by the begin function and waits for them to get to an appropriate state. There's a choice of wanting just some parameters (FSCACHE_WANT_PARAMS) or the ability to do I/O (FSCACHE_WANT_READ or FSCACHE_WANT_WRITE). Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819603692.215744.146724961588817028.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906910672.143852.13856103384424986357.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967110245.1823006.2239170567540431836.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021513617.640689.16627329360866150606.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 11 ++++++++++ include/linux/fscache.h | 49 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 1ad56bfd9d72..566497cf5f13 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -67,6 +67,10 @@ struct fscache_cache_ops { /* Invalidate an object */ bool (*invalidate_cookie)(struct fscache_cookie *cookie); + /* Begin an operation for the netfs lib */ + bool (*begin_operation)(struct netfs_cache_resources *cres, + enum fscache_want_state want_state); + /* Prepare to write to
a live cache object */ void (*prepare_to_write)(struct fscache_cookie *cookie); }; @@ -101,6 +105,8 @@ extern void fscache_end_cookie_access(struct fscache_cookie *cookie, extern void fscache_cookie_lookup_negative(struct fscache_cookie *cookie); extern void fscache_resume_after_invalidation(struct fscache_cookie *cookie); extern void fscache_caching_failed(struct fscache_cookie *cookie); +extern bool fscache_wait_for_operation(struct netfs_cache_resources *cred, + enum fscache_want_state state); /** * fscache_cookie_state - Read the state of a cookie @@ -129,4 +135,9 @@ static inline void *fscache_get_key(struct fscache_cookie *cookie) return cookie->key; } +static inline struct fscache_cookie *fscache_cres_cookie(struct netfs_cache_resources *cres) +{ + return cres->cache_priv; +} + #endif /* _LINUX_FSCACHE_CACHE_H */ diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 0f36d1fac237..7cdc63c4fe35 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -41,6 +41,12 @@ struct fscache_cookie; #define FSCACHE_INVAL_DIO_WRITE 0x01 /* Invalidate due to DIO write */ +enum fscache_want_state { + FSCACHE_WANT_PARAMS, + FSCACHE_WANT_WRITE, + FSCACHE_WANT_READ, +}; + /* * Data object state. 
*/ @@ -157,6 +163,7 @@ extern void __fscache_use_cookie(struct fscache_cookie *, bool); extern void __fscache_unuse_cookie(struct fscache_cookie *, const void *, const loff_t *); extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool); extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t, unsigned int); +extern int __fscache_begin_read_operation(struct netfs_cache_resources *, struct fscache_cookie *); /** * fscache_acquire_volume - Register a volume as desiring caching services @@ -358,4 +365,46 @@ void fscache_invalidate(struct fscache_cookie *cookie, __fscache_invalidate(cookie, aux_data, size, flags); } +/** + * fscache_operation_valid - Return true if operations resources are usable + * @cres: The resources to check. + * + * Returns a pointer to the operations table if usable or NULL if not. + */ +static inline +const struct netfs_cache_ops *fscache_operation_valid(const struct netfs_cache_resources *cres) +{ + return fscache_resources_valid(cres) ? cres->ops : NULL; +} + +/** + * fscache_begin_read_operation - Begin a read operation for the netfs lib + * @cres: The cache resources for the read being performed + * @cookie: The cookie representing the cache object + * + * Begin a read operation on behalf of the netfs helper library. @cres + * indicates the cache resources to which the operation state should be + * attached; @cookie indicates the cache object that will be accessed. + * + * This is intended to be called from the ->begin_cache_operation() netfs lib + * operation as implemented by the network filesystem. + * + * @cres->inval_counter is set from @cookie->inval_counter for comparison at + * the end of the operation. This allows invalidation during the operation to + * be detected by the caller. + * + * Returns: + * * 0 - Success + * * -ENOBUFS - No caching available + * * Other error code from the cache, such as -ENOMEM. 
+ */ +static inline +int fscache_begin_read_operation(struct netfs_cache_resources *cres, + struct fscache_cookie *cookie) +{ + if (fscache_cookie_enabled(cookie)) + return __fscache_begin_read_operation(cres, cookie); + return -ENOBUFS; +} + #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3 From cdf262f29488e6c3432911ec487ea41918fcbcd7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 11 Nov 2021 23:14:29 +0000 Subject: fscache: Count data storage objects in a cache Count the data storage objects that are currently allocated in a cache. This is used to pin certain cache structures until cache withdrawal is complete. Three helpers are provided to manage and make use of the count: (1) void fscache_count_object(struct fscache_cache *cache); This should be called by the cache backend to note that an object has been allocated and attached to the cache. (2) void fscache_uncount_object(struct fscache_cache *cache); This should be called by the backend to note that an object has been destroyed. This sends a wakeup event that allows cache withdrawal to proceed if it was waiting for that object. (3) void fscache_wait_for_objects(struct fscache_cache *cache); This can be used by the backend to wait for all outstanding cache objects to be destroyed. Each cache's counter is displayed as part of /proc/fs/fscache/caches.
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819608594.215744.1812706538117388252.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906911646.143852.168184059935530127.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967111846.1823006.9868154941573671255.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021516219.640689.4934796654308958158.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 566497cf5f13..337335d7a5e2 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -76,6 +76,7 @@ struct fscache_cache_ops { }; extern struct workqueue_struct *fscache_wq; +extern wait_queue_head_t fscache_clearance_waiters; /* * out-of-line cache backend functions @@ -140,4 +141,42 @@ static inline struct fscache_cookie *fscache_cres_cookie(struct netfs_cache_reso return cres->cache_priv; } +/** + * fscache_count_object - Tell fscache that an object has been added + * @cache: The cache to account to + * + * Tell fscache that an object has been added to the cache. This prevents the + * cache from tearing down the cache structure until the object is uncounted. + */ +static inline void fscache_count_object(struct fscache_cache *cache) +{ + atomic_inc(&cache->object_count); +} + +/** + * fscache_uncount_object - Tell fscache that an object has been removed + * @cache: The cache to account to + * + * Tell fscache that an object has been removed from the cache and will no + * longer be accessed. After this point, the cache cookie may be destroyed. 
+ */ +static inline void fscache_uncount_object(struct fscache_cache *cache) +{ + if (atomic_dec_and_test(&cache->object_count)) + wake_up_all(&fscache_clearance_waiters); +} + +/** + * fscache_wait_for_objects - Wait for all objects to be withdrawn + * @cache: The cache to query + * + * Wait for all extant objects in a cache to finish being withdrawn + * and go away. + */ +static inline void fscache_wait_for_objects(struct fscache_cache *cache) +{ + wait_event(fscache_clearance_waiters, + atomic_read(&cache->object_count) == 0); +} + #endif /* _LINUX_FSCACHE_CACHE_H */ -- cgit v1.2.3 From 8e7a867bb7309fbf47e8c2a68798b919fc02523f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 23:06:16 +0100 Subject: fscache: Provide read/write stat counters for the cache Provide read/write stat counters for the cache backend to use. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819609532.215744.10821082637727410554.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906912598.143852.12960327989649429069.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967113830.1823006.3222957649202368162.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021517502.640689.6077928311710357342.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 337335d7a5e2..796c8b5c5305 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -179,4 +179,14 @@ static inline void fscache_wait_for_objects(struct fscache_cache *cache) atomic_read(&cache->object_count) == 0); } +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_read; +extern atomic_t fscache_n_write; +#define fscache_count_read() atomic_inc(&fscache_n_read) +#define fscache_count_write() 
atomic_inc(&fscache_n_write) +#else +#define fscache_count_read() do {} while(0) +#define fscache_count_write() do {} while(0) +#endif + #endif /* _LINUX_FSCACHE_CACHE_H */ -- cgit v1.2.3 From ed1235eb78a7421cd0ac2ad09e931f8f07ccdc7c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 23:10:46 +0100 Subject: fscache: Provide a function to let the netfs update its coherency data Provide a function to let the netfs update its coherency data: void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data, const loff_t *object_size); This will update the auxiliary data and/or the size of the object attached to a cookie if either pointer is not-NULL and flag that the disk needs to be updated. Note that fscache_unuse_cookie() also allows this to be done. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819610438.215744.4223265964131424954.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906913530.143852.18150303220217653820.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967117795.1823006.7493373142653442595.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021518440.640689.6369952464473039268.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 7cdc63c4fe35..fc77648c8af6 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -338,6 +338,28 @@ void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &cookie->flags); } +/** + * fscache_update_cookie - Request that a cache object be updated + * @cookie: The cookie representing the cache object + * @aux_data: The updated auxiliary data for the cookie (may be NULL) + * @object_size: The current size of the object (may be 
NULL) + * + * Request an update of the index data for the cache object associated with the + * cookie. The auxiliary data on the cookie will be updated first if @aux_data + * is set and the object size will be updated and the object possibly trimmed + * if @object_size is set. + * + * See Documentation/filesystems/caching/netfs-api.rst for a complete + * description. + */ +static inline +void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data, + const loff_t *object_size) +{ + if (fscache_cookie_enabled(cookie)) + __fscache_update_cookie(cookie, aux_data, object_size); +} + /** * fscache_invalidate - Notify cache that an object needs invalidation * @cookie: The cookie representing the cache object -- cgit v1.2.3 From 3a11b3a86366ccbf0818b088ffecadf8b2d61177 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Sep 2021 09:47:45 +0100 Subject: netfs: Pass more information on how to deal with a hole in the cache Pass more information to the cache on how to deal with a hole if it encounters one when trying to read from the cache. Three options are provided: (1) NETFS_READ_HOLE_IGNORE. Read the hole along with the data, assuming it to be a punched-out extent by the backing filesystem. (2) NETFS_READ_HOLE_CLEAR. If there's a hole, erase the requested region of the cache and clear the read buffer. (3) NETFS_READ_HOLE_FAIL. Fail the read if a hole is detected. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819612321.215744.9738308885948264476.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906914460.143852.6284247083607910189.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967119923.1823006.15637375885194297582.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021519762.640689.16994364383313159319.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/netfs.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5a46fde65759..b46c39d98bbd 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -196,6 +196,15 @@ struct netfs_read_request_ops { void (*cleanup)(struct address_space *mapping, void *netfs_priv); }; +/* + * How to handle reading from a hole. + */ +enum netfs_read_from_hole { + NETFS_READ_HOLE_IGNORE, + NETFS_READ_HOLE_CLEAR, + NETFS_READ_HOLE_FAIL, +}; + /* * Table of operations for access to a cache. This is obtained by * rreq->ops->begin_cache_operation(). @@ -208,7 +217,7 @@ struct netfs_cache_ops { int (*read)(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, - bool seek_data, + enum netfs_read_from_hole read_hole, netfs_io_terminated_t term_func, void *term_func_priv); -- cgit v1.2.3 From 9af1c6c3089b294ffa240e0fbba356666698b6d0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 14:06:34 +0100 Subject: fscache: Implement raw I/O interface Provide a pair of functions to perform raw I/O on the cache. 
The first function allows an arbitrary asynchronous direct-IO read to be made against a cache object, though the read should be aligned and sized appropriately for the backing device: int fscache_read(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, enum netfs_read_from_hole read_hole, netfs_io_terminated_t term_func, void *term_func_priv); The cache resources must have been previously initialised by fscache_begin_read_operation(). A read operation is sent to the backing filesystem, starting at start_pos within the file. The size of the read is specified by the iterator, as is the location of the output buffer. If there is a hole in the data it can be ignored and left to the backing filesystem to deal with (NETFS_READ_HOLE_IGNORE), a hole at the beginning can be skipped over and the buffer padded with zeros (NETFS_READ_HOLE_CLEAR) or -ENODATA can be given (NETFS_READ_HOLE_FAIL). If term_func is not NULL, the operation may be performed asynchronously. Upon completion, successful or otherwise, (*term_func)() will be called and passed term_func_priv, along with an error or the amount of data transferred. If the op is run asynchronously, fscache_read() will return -EIOCBQUEUED. The second function allows an arbitrary asynchronous direct-IO write to be made against a cache object, though the write should be aligned and sized appropriately for the backing device: int fscache_write(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, netfs_io_terminated_t term_func, void *term_func_priv); This works in a very similar way to fscache_read(), except that there's no need to deal with holes (they're just overwritten). The caller is responsible for preventing concurrent overlapping writes.
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819613224.215744.7877577215582621254.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906915386.143852.16936177636106480724.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967122632.1823006.7487049517698562172.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021521420.640689.12747258780542678309.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index fc77648c8af6..ae753cae0fdd 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -429,4 +429,78 @@ int fscache_begin_read_operation(struct netfs_cache_resources *cres, return -ENOBUFS; } +/** + * fscache_read - Start a read from the cache. + * @cres: The cache resources to use + * @start_pos: The beginning file offset in the cache file + * @iter: The buffer to fill - and also the length + * @read_hole: How to handle a hole in the data. + * @term_func: The function to call upon completion + * @term_func_priv: The private data for @term_func + * + * Start a read from the cache. @cres indicates the cache object to read from + * and must be obtained by a call to fscache_begin_read_operation() beforehand. + * + * The data is read into the iterator, @iter, and that also indicates the size + * of the operation. @start_pos is the start position in the file, though if + * @read_hole is set appropriately, the cache can use SEEK_DATA to find the + * next piece of data, writing zeros for the hole into the iterator. + * + * Upon termination of the operation, @term_func will be called and supplied + * with @term_func_priv plus the amount of data written, if successful, or the + * error code otherwise.
+ * + * @read_hole indicates how a partially populated region in the cache should be + * handled. It can be one of a number of settings: + * + * NETFS_READ_HOLE_IGNORE - Just try to read (may return a short read). + * + * NETFS_READ_HOLE_CLEAR - Seek for data, clearing the part of the buffer + * skipped over, then do as for IGNORE. + * + * NETFS_READ_HOLE_FAIL - Give ENODATA if we encounter a hole. + */ +static inline +int fscache_read(struct netfs_cache_resources *cres, + loff_t start_pos, + struct iov_iter *iter, + enum netfs_read_from_hole read_hole, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + const struct netfs_cache_ops *ops = fscache_operation_valid(cres); + return ops->read(cres, start_pos, iter, read_hole, + term_func, term_func_priv); +} + +/** + * fscache_write - Start a write to the cache. + * @cres: The cache resources to use + * @start_pos: The beginning file offset in the cache file + * @iter: The data to write - and also the length + * @term_func: The function to call upon completion + * @term_func_priv: The private data for @term_func + * + * Start a write to the cache. @cres indicates the cache object to write to and + * must be obtained by a call to fscache_begin_operation() beforehand. + * + * The data to be written is obtained from the iterator, @iter, and that also + * indicates the size of the operation. @start_pos is the start position in + * the file. + * + * Upon termination of the operation, @term_func will be called and supplied + * with @term_func_priv plus the amount of data written, if successful, or the + * error code otherwise. 
+ */ +static inline +int fscache_write(struct netfs_cache_resources *cres, + loff_t start_pos, + struct iov_iter *iter, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + const struct netfs_cache_ops *ops = fscache_operation_valid(cres); + return ops->write(cres, start_pos, iter, term_func, term_func_priv); +} + #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3 From b6e16652d6c0e4f9e9b120f66966ec153f0623fc Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 14:06:34 +0100 Subject: fscache: Implement higher-level write I/O interface Provide a higher-level function than fscache_write() to perform a write from an inode's pagecache to the cache, whilst fending off concurrent writes by means of the PG_fscache mark on a page: void fscache_write_to_cache(struct fscache_cookie *cookie, struct address_space *mapping, loff_t start, size_t len, loff_t i_size, netfs_io_terminated_t term_func, void *term_func_priv, bool caching); If caching is false, this function does nothing except call (*term_func)() if given. It assumes that, in such a case, PG_fscache will not have been set on the pages. Otherwise, if caching is true, this function requires the source pages to have had PG_fscache set on them before calling. start and len define the region of the file to be modified and i_size indicates the new file size. The source pages are extracted from the mapping. term_func and term_func_priv work as for fscache_write(). The PG_fscache marks will be cleared at the end of the operation, before term_func is called or the function otherwise returns. There is an additional helper function to clear the PG_fscache bits from a range of pages: void fscache_clear_page_bits(struct fscache_cookie *cookie, struct address_space *mapping, loff_t start, size_t len, bool caching); If caching is true, the pages to be managed are expected to be located on mapping in the range defined by start and len. If caching is false, it does nothing.
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819614155.215744.5528123235123721230.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906916346.143852.15632773570362489926.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967123599.1823006.12946816026724657428.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021522672.640689.4381958316198807813.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ae753cae0fdd..9d469613e16c 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -165,6 +165,11 @@ extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool); extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t, unsigned int); extern int __fscache_begin_read_operation(struct netfs_cache_resources *, struct fscache_cookie *); +extern void __fscache_write_to_cache(struct fscache_cookie *, struct address_space *, + loff_t, size_t, loff_t, netfs_io_terminated_t, void *, + bool); +extern void __fscache_clear_page_bits(struct address_space *, loff_t, size_t); + /** * fscache_acquire_volume - Register a volume as desiring caching services * @volume_key: An identification string for the volume @@ -503,4 +508,62 @@ int fscache_write(struct netfs_cache_resources *cres, return ops->write(cres, start_pos, iter, term_func, term_func_priv); } +/** + * fscache_clear_page_bits - Clear the PG_fscache bits from a set of pages + * @cookie: The cookie representing the cache object + * @mapping: The netfs inode to use as the source + * @start: The start position in @mapping + * @len: The amount of data to unlock + * @caching: If PG_fscache has been set + * + * Clear the PG_fscache flag from a sequence of pages and 
wake up anyone who's + * waiting. + */ +static inline void fscache_clear_page_bits(struct fscache_cookie *cookie, + struct address_space *mapping, + loff_t start, size_t len, + bool caching) +{ + if (caching) + __fscache_clear_page_bits(mapping, start, len); +} + +/** + * fscache_write_to_cache - Save a write to the cache and clear PG_fscache + * @cookie: The cookie representing the cache object + * @mapping: The netfs inode to use as the source + * @start: The start position in @mapping + * @len: The amount of data to write back + * @i_size: The new size of the inode + * @term_func: The function to call upon completion + * @term_func_priv: The private data for @term_func + * @caching: If PG_fscache has been set + * + * Helper function for a netfs to write dirty data from an inode into the cache + * object that's backing it. + * + * @start and @len describe the range of the data. This does not need to be + * page-aligned, but to satisfy DIO requirements, the cache may expand it up to + * the page boundaries on either end. All the pages covering the range must be + * marked with PG_fscache. + * + * If given, @term_func will be called upon completion and supplied with + * @term_func_priv. Note that the PG_fscache flags will have been cleared by + * this point, so the netfs must retain its own pin on the mapping. 
+ */ +static inline void fscache_write_to_cache(struct fscache_cookie *cookie, + struct address_space *mapping, + loff_t start, size_t len, loff_t i_size, + netfs_io_terminated_t term_func, + void *term_func_priv, + bool caching) +{ + if (caching) + __fscache_write_to_cache(cookie, mapping, start, len, i_size, + term_func, term_func_priv, caching); + else if (term_func) + term_func(term_func_priv, -ENOBUFS, false); + +} + #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3 From 08276bdae68b022a7726edf7416b6748e3df5395 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 23:50:01 +0100 Subject: vfs, fscache: Implement pinning of cache usage for writeback Cachefiles has a problem in that it needs to keep the backing file for a cookie open whilst there are local modifications pending that need to be written to it. However, we don't want to keep the file open indefinitely, as that causes EMFILE/ENFILE/ENOMEM problems. Reopening the cache file, however, is a problem if this is being done due to writeback triggered by exit(). Some filesystems will oops if we try to open a file in that context because they want to access current->fs or other resources that have already been dismantled. To get around this, I added the following: (1) An inode flag, I_PINNING_FSCACHE_WB, to be set on a network filesystem inode to indicate that we have a usage count on the cookie caching that inode. (2) A flag in struct writeback_control, unpinned_fscache_wb, that is set when __writeback_single_inode() clears the last dirty page from i_pages - at which point it clears I_PINNING_FSCACHE_WB and sets this flag. This has to be done here so that clearing I_PINNING_FSCACHE_WB can be done atomically with the check of PAGECACHE_TAG_DIRTY that clears I_DIRTY_PAGES. (3) A function, fscache_set_page_dirty(), which if it is not set, sets I_PINNING_FSCACHE_WB and calls fscache_use_cookie() to pin the cache resources. 
(4) A function, fscache_unpin_writeback(), to be called by ->write_inode() to unuse the cookie. (5) A function, fscache_clear_inode_writeback(), to be called when the inode is evicted, before clear_inode() is called. This cleans up any lingering I_PINNING_FSCACHE_WB. The network filesystem can then use these tools to make sure that fscache_write_to_cache() can write locally modified data to the cache as well as to the server. For the future, I'm working on write helpers for netfs lib that should allow this facility to be removed by keeping track of the dirty regions separately - but that's incomplete at the moment and is also going to be affected by folios, one way or another, since it deals with pages Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819615157.215744.17623791756928043114.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906917856.143852.8224898306177154573.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967124567.1823006.14188359004568060298.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021524705.640689.17824932021727663017.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fs.h | 3 +++ include/linux/fscache.h | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/writeback.h | 1 + 3 files changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbf812ce89a8..2c0b8e77d9ab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2418,6 +2418,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * Used to detect that mark_inode_dirty() should not move * inode between dirty lists. * + * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? 
*/ #define I_DIRTY_SYNC (1 << 0) @@ -2440,6 +2442,7 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define I_CREATING (1 << 15) #define I_DONTCACHE (1 << 16) #define I_SYNC_QUEUED (1 << 17) +#define I_PINNING_FSCACHE_WB (1 << 18) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 9d469613e16c..18e725671594 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -16,6 +16,7 @@ #include #include +#include #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE) #define __fscache_available (1) @@ -566,4 +567,44 @@ static inline void fscache_write_to_cache(struct fscache_cookie *cookie, } +#if __fscache_available +extern int fscache_set_page_dirty(struct page *page, struct fscache_cookie *cookie); +#else +#define fscache_set_page_dirty(PAGE, COOKIE) (__set_page_dirty_nobuffers((PAGE))) +#endif + +/** + * fscache_unpin_writeback - Unpin writeback resources + * @wbc: The writeback control + * @cookie: The cookie referring to the cache object + * + * Unpin the writeback resources pinned by fscache_set_page_dirty(). This is + * intended to be called by the netfs's ->write_inode() method. + */ +static inline void fscache_unpin_writeback(struct writeback_control *wbc, + struct fscache_cookie *cookie) +{ + if (wbc->unpinned_fscache_wb) + fscache_unuse_cookie(cookie, NULL, NULL); +} + +/** + * fscache_clear_inode_writeback - Clear writeback resources pinned by an inode + * @cookie: The cookie referring to the cache object + * @inode: The inode to clean up + * @aux: Auxiliary data to apply to the inode + * + * Clear any writeback resources held by an inode when the inode is evicted. + * This must be called before clear_inode() is called. 
+ */ +static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie, + struct inode *inode, + const void *aux) +{ + if (inode->i_state & I_PINNING_FSCACHE_WB) { + loff_t i_size = i_size_read(inode); + fscache_unuse_cookie(cookie, aux, &i_size); + } +} + #endif /* _LINUX_FSCACHE_H */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3bfd487d1dd2..fec248ab1fec 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -68,6 +68,7 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ + unsigned unpinned_fscache_wb:1; /* Cleared I_PINNING_FSCACHE_WB */ /* * When writeback IOs are bounced through async layers, only the -- cgit v1.2.3 From 1f67e6d0b18853c641d861a671f46a4964a88510 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 14:06:34 +0100 Subject: fscache: Provide a function to note the release of a page Provide a function to be called from a network filesystem's releasepage method to indicate that a page has been released that might have been a reflection of data upon the server - and now that data must be reloaded from the server or the cache. This is used to end an optimisation for empty files, in particular files that have just been created locally, whereby we know there cannot yet be any data that we would need to read from the server or the cache. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819617128.215744.4725572296135656508.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906920354.143852.7511819614661372008.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967128061.1823006.611781655060034988.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021525963.640689.9264556596205140044.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 18e725671594..28ce258c1f87 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -607,4 +607,20 @@ static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie, } } +/** + * fscache_note_page_release - Note that a netfs page got released + * @cookie: The cookie corresponding to the file + * + * Note that a page that has been copied to the cache has been released. This + * means that future reads will need to look in the cache to see if it's there. 
+ */ +static inline +void fscache_note_page_release(struct fscache_cookie *cookie) +{ + if (cookie && + test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) && + test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) + clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); +} + #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3 From 16a96bdf92d5af06f9fa6a01a4b08e2fdfed2e5b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Oct 2021 14:06:34 +0100 Subject: fscache: Provide a function to resize a cookie Provide a function to change the size of the storage attached to a cookie, to match the size of the file being cached when it's changed by truncate or fallocate: void fscache_resize_cookie(struct fscache_cookie *cookie, loff_t new_size); This acts synchronously and is expected to run under the inode lock of the caller. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819621839.215744.7895597119803515402.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906922387.143852.16394459879816147793.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967128998.1823006.10740669081985775576.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021527861.640689.3466382085497236267.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 4 ++++ include/linux/fscache.h | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 796c8b5c5305..3fa4902dc87c 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -64,6 +64,10 @@ struct fscache_cache_ops { /* Withdraw an object without any cookie access counts held */ void (*withdraw_cookie)(struct fscache_cookie *cookie); + /* Change the size of a data object */ + void (*resize_cookie)(struct netfs_cache_resources *cres, + loff_t new_size); + /* Invalidate an 
object */ bool (*invalidate_cookie)(struct fscache_cookie *cookie); diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 28ce258c1f87..86b1c0db1de5 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -163,6 +163,7 @@ extern struct fscache_cookie *__fscache_acquire_cookie( extern void __fscache_use_cookie(struct fscache_cookie *, bool); extern void __fscache_unuse_cookie(struct fscache_cookie *, const void *, const loff_t *); extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool); +extern void __fscache_resize_cookie(struct fscache_cookie *, loff_t); extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t, unsigned int); extern int __fscache_begin_read_operation(struct netfs_cache_resources *, struct fscache_cookie *); @@ -366,6 +367,23 @@ void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data, __fscache_update_cookie(cookie, aux_data, object_size); } +/** + * fscache_resize_cookie - Request that a cache object be resized + * @cookie: The cookie representing the cache object + * @new_size: The new size of the object + * + * Request that the size of an object be changed. + * + * See Documentation/filesystems/caching/netfs-api.rst for a complete + * description. + */ +static inline +void fscache_resize_cookie(struct fscache_cookie *cookie, loff_t new_size) +{ + if (fscache_cookie_enabled(cookie)) + __fscache_resize_cookie(cookie, new_size); +} + /** * fscache_invalidate - Notify cache that an object needs invalidation * @cookie: The cookie representing the cache object -- cgit v1.2.3 From 1bd9c4e4f0494915b2391f373d25096579f835ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 18 Nov 2021 08:58:08 +0000 Subject: vfs, cachefiles: Mark a backing file in use with an inode flag Use an inode flag, S_KERNEL_FILE, to mark that a backing file is in use by the kernel to prevent cachefiles or other kernel services from interfering with that file.
Alter rmdir to reject attempts to remove a directory marked with this flag. This is used by cachefiles to prevent cachefilesd from removing them. Using S_SWAPFILE instead isn't really viable as that has other effects in the I/O paths. Changes ======= ver #3: - Check for the object pointer being NULL in the tracepoints rather than the caller. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819630256.215744.4815885535039369574.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906931596.143852.8642051223094013028.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967141000.1823006.12920680657559677789.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021541207.640689.564689725898537127.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c0b8e77d9ab..bcf1ca430139 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2249,6 +2249,7 @@ struct super_operations { #define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */ #define S_CASEFOLD (1 << 15) /* Casefolded file */ #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ +#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */ /* * Note that nosuid etc flags are inode-specific: setting some file-system -- cgit v1.2.3 From 32e150037dce368d129996ffe5f98217b1974d9e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Dec 2021 09:51:43 +0000 Subject: fscache, cachefiles: Store the volume coherency data Store the volume coherency data in an xattr and check it when we rebind the volume. If it doesn't match the cache volume is moved to the graveyard and rebuilt anew. Changes ======= ver #4: - Remove a couple of debugging prints. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton Link: https://lore.kernel.org/r/163967164397.1823006.2950539849831291830.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021563138.640689.15851092065380543119.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 86b1c0db1de5..7bd35f60d19a 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -87,6 +87,8 @@ struct fscache_volume { #define FSCACHE_VOLUME_COLLIDED_WITH 2 /* Volume was collided with */ #define FSCACHE_VOLUME_ACQUIRE_PENDING 3 /* Volume is waiting to complete acquisition */ #define FSCACHE_VOLUME_CREATING 4 /* Volume is being created on disk */ + u8 coherency_len; /* Length of the coherency data */ + u8 coherency[]; /* Coherency data */ }; /* -- cgit v1.2.3 From 3929eca769b5a231010b4978acc61c0735da198f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 21 Oct 2021 21:58:29 +0100 Subject: fscache, cachefiles: Display stats of no-space events Add stat counters of no-space events that caused caching not to happen and display in /proc/fs/fscache/stats. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819653216.215744.17210522251617386509.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906958369.143852.7257100711818401748.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967166917.1823006.14842444049198947892.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021566184.640689.4417328329632709265.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 3fa4902dc87c..007e47f38610 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -186,11 +186,17 @@ static inline void fscache_wait_for_objects(struct fscache_cache *cache) #ifdef CONFIG_FSCACHE_STATS extern atomic_t fscache_n_read; extern atomic_t fscache_n_write; +extern atomic_t fscache_n_no_write_space; +extern atomic_t fscache_n_no_create_space; #define fscache_count_read() atomic_inc(&fscache_n_read) #define fscache_count_write() atomic_inc(&fscache_n_write) +#define fscache_count_no_write_space() atomic_inc(&fscache_n_no_write_space) +#define fscache_count_no_create_space() atomic_inc(&fscache_n_no_create_space) #else #define fscache_count_read() do {} while(0) #define fscache_count_write() do {} while(0) +#define fscache_count_no_write_space() do {} while(0) +#define fscache_count_no_create_space() do {} while(0) #endif #endif /* _LINUX_FSCACHE_CACHE_H */ -- cgit v1.2.3 From 9f08ebc3438baaaefcc79654b330209b83397f17 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 22 Oct 2021 09:17:58 +0100 Subject: fscache, cachefiles: Display stat of culling events Add a stat counter of culling events whereby the cache backend culls a file to make space (when asked by cachefilesd in this case) and display in /proc/fs/fscache/stats. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/163819654165.215744.3797804661644212436.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906961387.143852.9291157239960289090.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967168266.1823006.14436200166581605746.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021567619.640689.4339228906248763197.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache-cache.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 007e47f38610..a174cedf4d90 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -188,15 +188,18 @@ extern atomic_t fscache_n_read; extern atomic_t fscache_n_write; extern atomic_t fscache_n_no_write_space; extern atomic_t fscache_n_no_create_space; +extern atomic_t fscache_n_culled; #define fscache_count_read() atomic_inc(&fscache_n_read) #define fscache_count_write() atomic_inc(&fscache_n_write) #define fscache_count_no_write_space() atomic_inc(&fscache_n_no_write_space) #define fscache_count_no_create_space() atomic_inc(&fscache_n_no_create_space) +#define fscache_count_culled() atomic_inc(&fscache_n_culled) #else #define fscache_count_read() do {} while(0) #define fscache_count_write() do {} while(0) #define fscache_count_no_write_space() do {} while(0) #define fscache_count_no_create_space() do {} while(0) +#define fscache_count_culled() do {} while(0) #endif #endif /* _LINUX_FSCACHE_CACHE_H */ -- cgit v1.2.3 From 2efd61a608b0039911924d2e5d7028eb37496e85 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 10 Dec 2021 16:36:20 +0000 Subject: KVM: Warn if mark_page_dirty() is called without an active vCPU The various kvm_write_guest() and mark_page_dirty() functions must only ever be called in the context of an active vCPU, because if dirty ring 
tracking is enabled it may simply oops when kvm_get_running_vcpu() returns NULL for the vcpu and then kvm_dirty_ring_get() dereferences it. This oops was reported by "butt3rflyh4ck" in https://lore.kernel.org/kvm/CAFcO6XOmoS7EacN_n6v4Txk7xL7iqRa2gABg3F7E3Naf5uG94g@mail.gmail.com/ That actual bug will be fixed under separate cover but this warning should help to prevent new ones from being added. Signed-off-by: David Woodhouse Message-Id: <20211210163625.2886-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_dirty_ring.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 4da8d4a4140b..906f899813dc 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -43,11 +43,6 @@ static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, return 0; } -static inline struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm) -{ - return NULL; -} - static inline int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring) { @@ -78,7 +73,6 @@ static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) u32 kvm_dirty_ring_get_rsvd_entries(void); int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); -struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm); /* * called with kvm->slots_lock held, returns the number of -- cgit v1.2.3 From 982ed0de4753ed6e71dbd40f82a5a066baf133ed Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 10 Dec 2021 16:36:21 +0000 Subject: KVM: Reinstate gfn_to_pfn_cache with invalidation support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This can be used in two modes. There is an atomic mode where the cached mapping is accessed while holding the rwlock, and a mode where the physical address is used by a vCPU in guest mode. 
For the latter case, an invalidation will wake the vCPU with the new KVM_REQ_GPC_INVALIDATE, and the architecture will need to refresh any caches it still needs to access before entering guest mode again. Only one vCPU can be targeted by the wake requests; it's simple enough to make it wake all vCPUs or even a mask but I don't see a use case for that additional complexity right now. Invalidation happens from the invalidate_range_start MMU notifier, which needs to be able to sleep in order to wake the vCPU and wait for it. This means that revalidation potentially needs to "wait" for the MMU operation to complete and the invalidate_range_end notifier to be invoked. Like the vCPU when it takes a page fault in that period, we just spin — fixing that in a future patch by implementing an actual *wait* may be another part of shaving this particularly hirsute yak. As noted in the comments in the function itself, the only case where the invalidate_range_start notifier is expected to be called *without* being able to sleep is when the OOM reaper is killing the process. In that case, we expect the vCPU threads already to have exited, and thus there will be nothing to wake, and no reason to wait. So we clear the KVM_REQUEST_WAIT bit and send the request anyway, then complain loudly if there actually *was* anything to wake up. 
Signed-off-by: David Woodhouse Message-Id: <20211210163625.2886-3-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 103 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/kvm_types.h | 18 ++++++++ 2 files changed, 121 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f9bbcf519280..9bbb1f1d9e48 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -155,6 +155,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 #define KVM_REQ_VM_DEAD (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_GPC_INVALIDATE (5 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQUEST_ARCH_BASE 8 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ @@ -593,6 +594,10 @@ struct kvm { unsigned long mn_active_invalidate_count; struct rcuwait mn_memslots_update_rcuwait; + /* For management / invalidation of gfn_to_pfn_caches */ + spinlock_t gpc_lock; + struct list_head gpc_list; + /* * created_vcpus is protected by kvm->lock, and is incremented * at the beginning of KVM_CREATE_VCPU. online_vcpus is only @@ -1099,6 +1104,104 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); +/** + * kvm_gfn_to_pfn_cache_init - prepare a cached kernel mapping and HPA for a + * given guest physical address. + * + * @kvm: pointer to kvm instance. + * @gpc: struct gfn_to_pfn_cache object. + * @vcpu: vCPU to be used for marking pages dirty and to be woken on + * invalidation. + * @guest_uses_pa: indicates that the resulting host physical PFN is used while + * @vcpu is IN_GUEST_MODE so invalidations should wake it. + * @kernel_map: requests a kernel virtual mapping (kmap / memremap). + * @gpa: guest physical address to map. + * @len: sanity check; the range being access must fit a single page. 
+ * @dirty: mark the cache dirty immediately. + * + * @return: 0 for success. + * -EINVAL for a mapping which would cross a page boundary. + * -EFAULT for an untranslatable guest physical address. + * + * This primes a gfn_to_pfn_cache and links it into the @kvm's list for + * invalidations to be processed. Invalidation callbacks to @vcpu using + * %KVM_REQ_GPC_INVALIDATE will occur only for MMU notifiers, not for KVM + * memslot changes. Callers are required to use kvm_gfn_to_pfn_cache_check() + * to ensure that the cache is valid before accessing the target page. + */ +int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, + struct kvm_vcpu *vcpu, bool guest_uses_pa, + bool kernel_map, gpa_t gpa, unsigned long len, + bool dirty); + +/** + * kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache. + * + * @kvm: pointer to kvm instance. + * @gpc: struct gfn_to_pfn_cache object. + * @gpa: current guest physical address to map. + * @len: sanity check; the range being accessed must fit a single page. + * @dirty: mark the cache dirty immediately. + * + * @return: %true if the cache is still valid and the address matches. + * %false if the cache is not valid. + * + * Callers outside IN_GUEST_MODE context should hold a read lock on @gpc->lock + * while calling this function, and then continue to hold the lock until the + * access is complete. + * + * Callers in IN_GUEST_MODE may do so without locking, although they should + * still hold a read lock on kvm->srcu for the memslot checks. + */ +bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, + gpa_t gpa, unsigned long len); + +/** + * kvm_gfn_to_pfn_cache_refresh - update a previously initialized cache. + * + * @kvm: pointer to kvm instance. + * @gpc: struct gfn_to_pfn_cache object. + * @gpa: updated guest physical address to map. + * @len: sanity check; the range being accessed must fit a single page. + * @dirty: mark the cache dirty immediately.
+ * + * @return: 0 for success. + * -EINVAL for a mapping which would cross a page boundary. + * -EFAULT for an untranslatable guest physical address. + * + * This will attempt to refresh a gfn_to_pfn_cache. Note that a successful + * return from this function does not mean the page can be immediately + * accessed because it may have raced with an invalidation. Callers must + * still lock and check the cache status, as this function does not return + * with the lock still held to permit access. + */ +int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, + gpa_t gpa, unsigned long len, bool dirty); + +/** + * kvm_gfn_to_pfn_cache_unmap - temporarily unmap a gfn_to_pfn_cache. + * + * @kvm: pointer to kvm instance. + * @gpc: struct gfn_to_pfn_cache object. + * + * This unmaps the referenced page and marks it dirty, if appropriate. The + * cache is left in the invalid state but at least the mapping from GPA to + * userspace HVA will remain cached and can be reused on a subsequent + * refresh. + */ +void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); + +/** + * kvm_gfn_to_pfn_cache_destroy - destroy and unlink a gfn_to_pfn_cache. + * + * @kvm: pointer to kvm instance. + * @gpc: struct gfn_to_pfn_cache object. + * + * This removes a cache from the @kvm's list to be processed on MMU notifier + * invocation.
+ */ +void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); + void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 888ef12862c9..dceac12c1ce5 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -19,6 +19,7 @@ struct kvm_memslots; enum kvm_mr_change; #include +#include #include @@ -53,6 +54,23 @@ struct gfn_to_hva_cache { struct kvm_memory_slot *memslot; }; +struct gfn_to_pfn_cache { + u64 generation; + gpa_t gpa; + unsigned long uhva; + struct kvm_memory_slot *memslot; + struct kvm_vcpu *vcpu; + struct list_head list; + rwlock_t lock; + void *khva; + kvm_pfn_t pfn; + bool active; + bool valid; + bool dirty; + bool kernel_map; + bool guest_uses_pa; +}; + #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE /* * Memory caches are used to preallocate memory ahead of various MMU flows, -- cgit v1.2.3 From 14243b387137a4afbe1df5d9dc15182d6657bb79 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 10 Dec 2021 16:36:23 +0000 Subject: KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery This adds basic support for delivering 2 level event channels to a guest. Initially, it only supports delivery via the IRQ routing table, triggered by an eventfd. In order to do so, it has a kvm_xen_set_evtchn_fast() function which will use the pre-mapped shared_info page if it already exists and is still valid, while the slow path through the irqfd_inject workqueue will remap the shared_info page if necessary. It sets the bits in the shared_info page but not the vcpu_info; that is deferred to __kvm_xen_has_interrupt() which raises the vector to the appropriate vCPU. Add a 'verbose' mode to xen_shinfo_test while adding test cases for this. 
Signed-off-by: David Woodhouse Message-Id: <20211210163625.2886-5-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9bbb1f1d9e48..3c47b146851a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -497,6 +497,12 @@ struct kvm_hv_sint { u32 sint; }; +struct kvm_xen_evtchn { + u32 port; + u32 vcpu; + u32 priority; +}; + struct kvm_kernel_irq_routing_entry { u32 gsi; u32 type; @@ -517,6 +523,7 @@ struct kvm_kernel_irq_routing_entry { } msi; struct kvm_s390_adapter_int adapter; struct kvm_hv_sint hv_sint; + struct kvm_xen_evtchn xen_evtchn; }; struct hlist_node link; }; -- cgit v1.2.3 From 44ea62813f0ab3d718de480504f4dfd0bdd01858 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Sep 2021 18:31:40 -0700 Subject: spi: don't include ptp_clock_kernel.h in spi.h Commit b42faeee718c ("spi: Add a PTP system timestamp to the transfer structure") added an include of ptp_clock_kernel.h to spi.h for struct ptp_system_timestamp but a forward declaration is enough. Let's use that to limit the number of objects we have to rebuild every time we touch networking headers. 
Signed-off-by: Jakub Kicinski Tested-by: Vladimir Oltean Link: https://lore.kernel.org/r/20210904013140.2377609-1-kuba@kernel.org Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index eb7ac8a1e03c..7ab3fed7b804 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -14,12 +14,12 @@ #include #include #include -#include #include struct dma_chan; struct software_node; +struct ptp_system_timestamp; struct spi_controller; struct spi_transfer; struct spi_controller_mem_ops; -- cgit v1.2.3 From 3506659e18a61ae525f3b9b4f5af23b4b149d4db Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 28 Nov 2021 14:53:35 -0500 Subject: mm: Add unmap_mapping_folio() Convert both callers of unmap_mapping_page() to call unmap_mapping_folio() instead. Also move zap_details from linux/mm.h to mm/memory.c Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski --- include/linux/mm.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 145f045b0ddc..c9cdb26802fb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1825,28 +1825,6 @@ static inline bool can_do_mlock(void) { return false; } extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *); -/* - * Parameter block passed down to zap_pte_range in exceptional cases. - */ -struct zap_details { - struct address_space *zap_mapping; /* Check page->mapping if set */ - struct page *single_page; /* Locked page to be unmapped */ -}; - -/* - * We set details->zap_mappings when we want to unmap shared but keep private - * pages. Return true if skip zapping this page, false otherwise. 
- */ -static inline bool -zap_skip_check_mapping(struct zap_details *details, struct page *page) -{ - if (!details || !page) - return false; - - return details->zap_mapping && - (details->zap_mapping != page_rmapping(page)); -} - struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -1892,7 +1870,6 @@ extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); -void unmap_mapping_page(struct page *page); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, @@ -1913,7 +1890,6 @@ static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, BUG(); return -EFAULT; } -static inline void unmap_mapping_page(struct page *page) { } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, -- cgit v1.2.3 From 1e84a3d997b74c33491899e31d48774f252213ab Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 2 Dec 2021 16:01:55 -0500 Subject: truncate,shmem: Add truncate_inode_folio() Convert all callers of truncate_inode_page() to call truncate_inode_folio() instead, and move the declaration to mm/internal.h. Move the assertion that the caller is not passing in a tail page to generic_error_remove_page(). We can't entirely remove the struct page from the callers yet because the page pointer in the pvec might be a shadow/dax/swap entry instead of actually a page. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c9cdb26802fb..d8b7d7ed14dd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1859,7 +1859,6 @@ extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); -int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); int invalidate_inode_page(struct page *page); -- cgit v1.2.3 From 0e499ed3d7a216706e02eeded562627d3e69dcfd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 1 Sep 2020 23:17:50 -0400 Subject: filemap: Return only folios from find_get_entries() The callers have all been converted to work on folios, so convert find_get_entries() to return a batch of folios instead of pages. We also now return multiple large folios in a single call. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig --- include/linux/pagemap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index eb6e58e106c8..d2259a1da51c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -592,8 +592,6 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) return head + (index & (thp_nr_pages(head) - 1)); } -unsigned find_get_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices); unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages); -- cgit v1.2.3 From 51dcbdac28d4dde915f78adf08bb3fac87f516e9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Dec 2021 14:15:07 -0500 Subject: mm: Convert find_lock_entries() to use a folio_batch find_lock_entries() already only returned the head page of folios, so convert it to return a folio_batch instead of a pagevec. That cascades through converting truncate_inode_pages_range() to delete_from_page_cache_batch() and page_cache_delete_batch(). 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/pagemap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d2259a1da51c..6e038811f4c8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -16,7 +16,7 @@ #include /* for in_interrupt() */ #include -struct pagevec; +struct folio_batch; static inline bool mapping_empty(struct address_space *mapping) { @@ -936,7 +936,7 @@ static inline void __delete_from_page_cache(struct page *page, void *shadow) } void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, - struct pagevec *pvec); + struct folio_batch *fbatch); int try_to_release_page(struct page *page, gfp_t gfp); bool filemap_release_folio(struct folio *folio, gfp_t gfp); loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, -- cgit v1.2.3 From 1613fac9aaf840af76faa747ea428a714af98dbd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Dec 2021 14:28:49 -0500 Subject: mm: Remove pagevec_remove_exceptionals() All of its callers now call folio_batch_remove_exceptionals(). 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/pagevec.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index c3fa616d7ae7..dda8d5868c81 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -27,7 +27,6 @@ struct pagevec { void __pagevec_release(struct pagevec *pvec); void __pagevec_lru_add(struct pagevec *pvec); -void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, pgoff_t *start, pgoff_t end); @@ -146,8 +145,5 @@ static inline void folio_batch_release(struct folio_batch *fbatch) pagevec_release((struct pagevec *)fbatch); } -static inline void folio_batch_remove_exceptionals(struct folio_batch *fbatch) -{ - pagevec_remove_exceptionals((struct pagevec *)fbatch); -} +void folio_batch_remove_exceptionals(struct folio_batch *fbatch); #endif /* _LINUX_PAGEVEC_H */ -- cgit v1.2.3 From 25a8de7f8d970ffa7263bd9d32a08138cd949f17 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 27 Aug 2021 07:21:49 -0400 Subject: XArray: Add xas_advance() Add a new helper function to help iterate over multi-index entries. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski --- include/linux/xarray.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index a91e3d90df8a..d6d5da6ed735 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1580,6 +1580,24 @@ static inline void xas_set(struct xa_state *xas, unsigned long index) xas->xa_node = XAS_RESTART; } +/** + * xas_advance() - Skip over sibling entries. + * @xas: XArray operation state. + * @index: Index of last sibling entry. + * + * Move the operation state to refer to the last sibling entry. 
+ * This is useful for loops that normally want to see sibling + * entries but sometimes want to skip them. Use xas_set() if you + * want to move to an index which is not part of this entry. + */ +static inline void xas_advance(struct xa_state *xas, unsigned long index) +{ + unsigned char shift = xas_is_node(xas) ? xas->xa_node->shift : 0; + + xas->xa_index = index; + xas->xa_offset = (index >> shift) & XA_CHUNK_MASK; +} + /** * xas_set_order() - Set up XArray operation state for a multislot entry. * @xas: XArray operation state. -- cgit v1.2.3 From 6b24ca4a1a8d4ee3221d6d44ddbb99f542e4bda3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 27 Jun 2020 22:19:08 -0400 Subject: mm: Use multi-index entries in the page cache We currently store large folios as 2^N consecutive entries. While this consumes rather more memory than necessary, it also turns out to be buggy. A writeback operation which starts within a tail page of a dirty folio will not write back the folio as the xarray's dirty bit is only set on the head index. With multi-index entries, the dirty bit will be found no matter where in the folio the operation starts. This does end up simplifying the page cache slightly, although not as much as I had hoped. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski --- include/linux/pagemap.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 6e038811f4c8..704cb1b4b15d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1125,16 +1125,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, VM_BUG_ON_PAGE(PageTail(page), page); array[i++] = page; rac->_batch_count += thp_nr_pages(page); - - /* - * The page cache isn't using multi-index entries yet, - * so the xas cursor needs to be manually moved to the - * next index. This can be removed once the page cache - * is converted. 
- */ - if (PageHead(page)) - xas_set(&xas, rac->_index + rac->_batch_count); - if (i == array_sz) break; } -- cgit v1.2.3 From e32cf5dfbe227b355776948b2c9b5691b84d1cbd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 22 Dec 2021 22:10:09 -0600 Subject: kthread: Generalize pf_io_worker so it can point to struct kthread The point of using set_child_tid to hold the kthread pointer was that it already did what is necessary. There are now restrictions on when set_child_tid can be initialized and when set_child_tid can be used in schedule_tail. Which indicates that continuing to use set_child_tid to hold the kthread pointer is a bad idea. Instead of continuing to use the set_child_tid field of task_struct generalize the pf_io_worker field of task_struct and use it to hold the kthread pointer. Rename pf_io_worker (which is a void * pointer) to worker_private so it can be used to store kthreads struct kthread pointer. Update the kthread code to store the kthread pointer in the worker_private field. Remove the places where set_child_tid had to be dealt with carefully because kthreads also used it. Link: https://lkml.kernel.org/r/CAHk-=wgtFAA9SbVYg0gR1tqPMC17-NYcs0GQkaYg1bGhh1uJQQ@mail.gmail.com Link: https://lkml.kernel.org/r/87a6grvqy8.fsf_-_@email.froward.int.ebiederm.org Suggested-by: Linus Torvalds Signed-off-by: "Eric W. 
Biederman" --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 78c351e35fec..52f2fdffa3ab 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -987,8 +987,8 @@ struct task_struct { /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; - /* PF_IO_WORKER */ - void *pf_io_worker; + /* PF_KTHREAD | PF_IO_WORKER */ + void *worker_private; u64 utime; u64 stime; -- cgit v1.2.3 From 3367d1bd738c01b2737eaab7d922bfe5f1a41f38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 8 Jan 2022 16:31:58 +0100 Subject: power: supply: Provide stubs for charge_behaviour helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_SYSFS is not enabled provide stubs for the helper functions to not break their callers. Fixes: 539b9c94ac83 ("power: supply: add helpers for charge_behaviour sysfs") Reported-by: kernel test robot Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20220108153158.189489-1-linux@weissschuh.net Signed-off-by: Hans de Goede --- include/linux/power_supply.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 71f0379c2af8..f6b9ed4630fa 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -553,6 +553,21 @@ ssize_t power_supply_charge_behaviour_show(struct device *dev, char *buf); int power_supply_charge_behaviour_parse(unsigned int available_behaviours, const char *buf); +#else +static inline +ssize_t power_supply_charge_behaviour_show(struct device *dev, + unsigned int available_behaviours, + enum power_supply_charge_behaviour behaviour, + char *buf) +{ + return -EOPNOTSUPP; +} + +static inline int power_supply_charge_behaviour_parse(unsigned int available_behaviours, + const char *buf) +{ + return 
-EOPNOTSUPP; +} #endif #endif /* __LINUX_POWER_SUPPLY_H__ */ -- cgit v1.2.3 From 2f824d4d197e02275562359a2ae5274177ce500c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 8 Jan 2022 09:48:31 -0600 Subject: signal: Remove SIGNAL_GROUP_COREDUMP After the previous cleanups "signal->core_state" is set whenever SIGNAL_GROUP_COREDUMP is set and "signal->core_state" is tested whenever the code wants to know if a coredump is in progress. The remaining tests of SIGNAL_GROUP_COREDUMP also test to see if SIGNAL_GROUP_EXIT is set. Similarly the only place that sets SIGNAL_GROUP_COREDUMP also sets SIGNAL_GROUP_EXIT. Which makes SIGNAL_GROUP_COREDUMP unnecessary and redundant. So stop setting SIGNAL_GROUP_COREDUMP, stop testing SIGNAL_GROUP_COREDUMP, and remove its definition. With the setting of SIGNAL_GROUP_COREDUMP gone, coredump_finish no longer needs to clear SIGNAL_GROUP_COREDUMP out of signal->flags by setting SIGNAL_GROUP_EXIT. Link: https://lkml.kernel.org/r/20211213225350.27481-5-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index fa26d2a58413..ecc10e148799 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -256,7 +256,6 @@ struct signal_struct { #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ -#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */ /* * Pending notifications to parent.
*/ @@ -272,7 +271,7 @@ struct signal_struct { static inline void signal_set_stop_flags(struct signal_struct *sig, unsigned int flags) { - WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); + WARN_ON(sig->flags & SIGNAL_GROUP_EXIT); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } -- cgit v1.2.3 From 60700e38fb68e800607ca7a027060d5419fc5798 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 6 Jun 2021 13:47:53 -0500 Subject: signal: Rename group_exit_task group_exec_task The only remaining user of group_exit_task is exec. Rename the field so that it is clear which part of the code uses it. Update the comment above the definition of group_exec_task to document how it is currently used. Link: https://lkml.kernel.org/r/20211213225350.27481-7-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index ecc10e148799..d3248aba5183 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -109,13 +109,9 @@ struct signal_struct { /* thread group exit support */ int group_exit_code; - /* overloaded: - * - notify group_exit_task when ->count is equal to notify_count - * - everyone except group_exit_task is stopped during signal delivery - * of fatal signals, group_exit_task processes the signal. 
- */ + /* notify group_exec_task when notify_count is less than or equal to 0 */ int notify_count; - struct task_struct *group_exit_task; + struct task_struct *group_exec_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; @@ -275,11 +271,11 @@ static inline void signal_set_stop_flags(struct signal_struct *sig, sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } -/* If true, all threads except ->group_exit_task have pending SIGKILL */ +/* If true, all threads except ->group_exec_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { return (sig->flags & SIGNAL_GROUP_EXIT) || - (sig->group_exit_task != NULL); + (sig->group_exec_task != NULL); } extern void flush_signals(struct task_struct *); -- cgit v1.2.3 From 49697335e0b441b0553598c1b48ee9ebb053d2f1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 24 Jun 2021 02:14:30 -0500 Subject: signal: Remove the helper signal_group_exit This helper is misleading. It tests for an ongoing exec as well as the process having received a fatal signal. Sometimes it is appropriate to treat an on-going exec differently than a process that is shutting down due to a fatal signal. In particular taking the fast path out of exit_signals instead of retargeting signals is not appropriate during exec, and not changing the exit code in do_group_exit during exec. Removing the helper makes it more obvious what is going on as both cases must be coded for explicitly. While removing the helper fix the two cases where I have observed using signal_group_exit resulted in the wrong result. In exit_signals only test for SIGNAL_GROUP_EXIT so that signals are retargeted during an exec. In do_group_exit use 0 as the exit code during an exec as de_thread does not set group_exit_code. As best as I can determine group_exit_code has been set to 0 most of the time during de_thread.
During a thread group stop group_exit_code is set to the stop signal and when the thread group receives SIGCONT group_exit_code is reset to 0. Link: https://lkml.kernel.org/r/20211213225350.27481-8-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index d3248aba5183..b6ecb9fc4cd2 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -271,13 +271,6 @@ static inline void signal_set_stop_flags(struct signal_struct *sig, sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } -/* If true, all threads except ->group_exec_task have pending SIGKILL */ -static inline int signal_group_exit(const struct signal_struct *sig) -{ - return (sig->flags & SIGNAL_GROUP_EXIT) || - (sig->group_exec_task != NULL); -} - extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); -- cgit v1.2.3 From 2d4bcf886e42f0f4846a3d9bdc3a90d278903a2e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 8 Jan 2022 11:23:02 -0600 Subject: exit: Remove profile_task_exit & profile_munmap When I say remove I mean remove. All profile_task_exit and profile_munmap do is call a blocking notifier chain. The helpers profile_task_register and profile_task_unregister are not called anywhere in the tree. Which means this is all dead code. So remove the dead code and make it easier to read do_exit. Reviewed-by: Christoph Hellwig Link: https://lkml.kernel.org/r/20220103213312.9144-1-ebiederm@xmission.com Signed-off-by: "Eric W. 
Biederman" --- include/linux/profile.h | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/profile.h b/include/linux/profile.h index fd18ca96f557..f7eb2b57d890 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -31,11 +31,6 @@ static inline int create_proc_profile(void) } #endif -enum profile_type { - PROFILE_TASK_EXIT, - PROFILE_MUNMAP -}; - #ifdef CONFIG_PROFILING extern int prof_on __read_mostly; @@ -66,23 +61,14 @@ static inline void profile_hit(int type, void *ip) struct task_struct; struct mm_struct; -/* task is in do_exit() */ -void profile_task_exit(struct task_struct * task); - /* task is dead, free task struct ? Returns 1 if * the task was taken, 0 if the task should be freed. */ int profile_handoff_task(struct task_struct * task); -/* sys_munmap */ -void profile_munmap(unsigned long addr); - int task_handoff_register(struct notifier_block * n); int task_handoff_unregister(struct notifier_block * n); -int profile_event_register(enum profile_type, struct notifier_block * n); -int profile_event_unregister(enum profile_type, struct notifier_block * n); - #else #define prof_on 0 @@ -117,19 +103,7 @@ static inline int task_handoff_unregister(struct notifier_block * n) return -ENOSYS; } -static inline int profile_event_register(enum profile_type t, struct notifier_block * n) -{ - return -ENOSYS; -} - -static inline int profile_event_unregister(enum profile_type t, struct notifier_block * n) -{ - return -ENOSYS; -} - -#define profile_task_exit(a) do { } while (0) #define profile_handoff_task(a) (0) -#define profile_munmap(a) do { } while (0) #endif /* CONFIG_PROFILING */ -- cgit v1.2.3 From 2873cd31a20c25b5e763b35e5fb886f0938c6dd5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 8 Jan 2022 10:03:24 -0600 Subject: exit: Remove profile_handoff_task All profile_handoff_task does is notify the task_free_notifier chain. 
The helpers task_handoff_register and task_handoff_unregister are used to add and delete entries from that chain and are never called. So remove the dead code and make it much easier to read and reason about __put_task_struct. Suggested-by: Al Viro Link: https://lkml.kernel.org/r/87fspyw6m0.fsf@email.froward.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- include/linux/profile.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/profile.h b/include/linux/profile.h index f7eb2b57d890..11db1ec516e2 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -61,14 +61,6 @@ static inline void profile_hit(int type, void *ip) struct task_struct; struct mm_struct; -/* task is dead, free task struct ? Returns 1 if - * the task was taken, 0 if the task should be freed. - */ -int profile_handoff_task(struct task_struct * task); - -int task_handoff_register(struct notifier_block * n); -int task_handoff_unregister(struct notifier_block * n); - #else #define prof_on 0 @@ -93,17 +85,6 @@ static inline void profile_hit(int type, void *ip) return; } -static inline int task_handoff_register(struct notifier_block * n) -{ - return -ENOSYS; -} - -static inline int task_handoff_unregister(struct notifier_block * n) -{ - return -ENOSYS; -} - -#define profile_handoff_task(a) (0) #endif /* CONFIG_PROFILING */ -- cgit v1.2.3 From 4264178416cd52a55a3eccbefb3973866e060280 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 20 Dec 2021 16:28:53 -0600 Subject: ptrace: Remove unused regs argument from ptrace_report_syscall Link: https://lkml.kernel.org/r/20220103213312.9144-7-ebiederm@xmission.com Signed-off-by: "Eric W. 
Biederman" --- include/linux/tracehook.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 2564b7434b4d..88c007ab5ebc 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -54,8 +54,7 @@ struct linux_binprm; /* * ptrace report for syscall entry and exit looks identical. */ -static inline int ptrace_report_syscall(struct pt_regs *regs, - unsigned long message) +static inline int ptrace_report_syscall(unsigned long message) { int ptrace = current->ptrace; @@ -102,7 +101,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs, static inline __must_check int tracehook_report_syscall_entry( struct pt_regs *regs) { - return ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_ENTRY); + return ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_ENTRY); } /** @@ -127,7 +126,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) if (step) user_single_step_report(regs); else - ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT); + ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_EXIT); } /** -- cgit v1.2.3 From 40595cdc93edf4110c0f0c0b06f8d82008f23929 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 16 Dec 2021 12:20:13 -0500 Subject: nfs: block notification on fs with its own ->lock NFSv4.1 supports an optional lock notification feature which notifies the client when a lock comes available. (Normally NFSv4 clients just poll for locks if necessary.) To make that work, we need to request a blocking lock from the filesystem. We turned that off for NFS in commit f657f8eef3ff ("nfs: don't atempt blocking locks on nfs reexports") [sic] because it actually blocks the nfsd thread while waiting for the lock. Thanks to Vasily Averin for pointing out that NFS isn't the only filesystem with that problem. Any filesystem that leaves ->lock NULL will use posix_lock_file(), which does the right thing. 
Simplest is just to assume that any filesystem that defines its own ->lock is not safe to request a blocking lock from. So, this patch mostly reverts commit f657f8eef3ff ("nfs: don't atempt blocking locks on nfs reexports") [sic] and commit b840be2f00c0 ("lockd: don't attempt blocking locks on nfs reexports"), and instead uses a check of ->lock (Vasily's suggestion) to decide whether to support blocking lock notifications on a given filesystem. Also add a little documentation. Perhaps someday we could add back an export flag later to allow filesystems with "good" ->lock methods to support blocking lock notifications. Reported-by: Vasily Averin Signed-off-by: J. Bruce Fields [ cel: Description rewritten to address checkpatch nits ] [ cel: Fixed warning when SUNRPC debugging is disabled ] [ cel: Fixed NULL check ] Signed-off-by: Chuck Lever Reviewed-by: Vasily Averin --- include/linux/exportfs.h | 2 -- include/linux/lockd/lockd.h | 9 +++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 3260fe714846..fe848901fcc3 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -221,8 +221,6 @@ struct export_operations { #define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply atomic attribute updates */ -#define EXPORT_OP_SYNC_LOCKS (0x20) /* Filesystem can't do - asychronous blocking locks */ unsigned long flags; }; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index c4ae6506b8b3..fcef192e5e45 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -303,10 +303,15 @@ void nlmsvc_invalidate_all(void); int nlmsvc_unlock_all_by_sb(struct super_block *sb); int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); +static inline struct file *nlmsvc_file_file(struct nlm_file *file) +{ + return file->f_file[O_RDONLY] ? 
+ file->f_file[O_RDONLY] : file->f_file[O_WRONLY]; +} + static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { - return locks_inode(file->f_file[O_RDONLY] ? - file->f_file[O_RDONLY] : file->f_file[O_WRONLY]); + return locks_inode(nlmsvc_file_file(file)); } static inline int __nlm_privileged_request4(const struct sockaddr *sap) -- cgit v1.2.3 From 0aa698787aa2a9e8840987e54ba2982559de6404 Mon Sep 17 00:00:00 2001 From: axelj Date: Mon, 13 Dec 2021 08:09:25 +0100 Subject: tpm: Add Upgrade/Reduced mode support for TPM2 modules If something went wrong during the TPM firmware upgrade, like power failure or the firmware image file getting corrupted, the TPM might end up in Upgrade or Failure mode upon the next start. The state is persistent between the TPM power cycle/restart. According to TPM specification: * If the TPM is in Upgrade mode, it will answer with TPM2_RC_UPGRADE to all commands except TPM2_FieldUpgradeData(). It may also accept other commands if it is able to complete them using the previously installed firmware. * If the TPM is in Failure mode, it will allow performing TPM initialization but will not provide any crypto operations. It will happily respond to Field Upgrade calls. Change the behavior of the tpm2_auto_startup(), so it detects the active running mode of the TPM by adding the following checks. If tpm2_do_selftest() call returns TPM2_RC_UPGRADE, the TPM is in Upgrade mode. If the TPM is in Failure mode, it will successfully respond to both tpm2_do_selftest() and tpm2_startup() calls. However, it will fail to answer tpm2_get_cc_attrs_tbl(). Use this fact to conclude that TPM is in Failure mode. If it is detected that the TPM is in the Upgrade or Failure mode, the function sets the TPM_CHIP_FLAG_FIRMWARE_UPGRADE flag. The TPM_CHIP_FLAG_FIRMWARE_UPGRADE flag is used later during driver initialization/deinitialization to disable functionality which makes no sense or will fail in the current TPM state.
Following functionality is affected: * Do not register TPM as a hwrng * Do not register sysfs entries which provide information impossible to obtain in limited mode * Do not register resource managed character device Signed-off-by: axelj Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 12d827734686..dfeb25a0362d 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -207,6 +207,7 @@ enum tpm2_return_codes { TPM2_RC_INITIALIZE = 0x0100, /* RC_VER1 */ TPM2_RC_FAILURE = 0x0101, TPM2_RC_DISABLED = 0x0120, + TPM2_RC_UPGRADE = 0x012D, TPM2_RC_COMMAND_CODE = 0x0143, TPM2_RC_TESTING = 0x090A, /* RC_WARN */ TPM2_RC_REFERENCE_H0 = 0x0910, @@ -278,6 +279,7 @@ enum tpm_chip_flags { TPM_CHIP_FLAG_HAVE_TIMEOUTS = BIT(4), TPM_CHIP_FLAG_ALWAYS_POWERED = BIT(5), TPM_CHIP_FLAG_FIRMWARE_POWER_MANAGED = BIT(6), + TPM_CHIP_FLAG_FIRMWARE_UPGRADE = BIT(7), }; #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev) @@ -399,6 +401,14 @@ static inline void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value) tpm_buf_append(buf, (u8 *) &value2, 4); } +/* + * Check if TPM device is in the firmware upgrade mode. + */ +static inline bool tpm_is_firmware_upgrade(struct tpm_chip *chip) +{ + return chip->flags & TPM_CHIP_FLAG_FIRMWARE_UPGRADE; +} + static inline u32 tpm2_rc_value(u32 rc) { return (rc & BIT(7)) ? rc & 0xff : rc; -- cgit v1.2.3 From 292c33c95defd0b814fec1fc8cd60d16556cf7b8 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 7 Jan 2022 08:52:28 +0800 Subject: block: fix old-style declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the 'inline' keyword to the front of 'void'. 
Remove a warning found by clang(make W=1 LLVM=1) ./include/linux/blk-mq.h:259:1: warning: ‘inline’ is not at beginning of declaration Reported-by: Abaci Robot Signed-off-by: Yang Li Link: https://lore.kernel.org/r/20220107005228.103927-1-yang.lee@linux.alibaba.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f40a05ecca4a..d319ffa59354 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -256,7 +256,7 @@ static inline unsigned short req_get_ioprio(struct request *req) * @rq: The request to move * @prev: The request preceding @rq in @src (NULL if @rq is the head) */ -static void inline rq_list_move(struct request **src, struct request **dst, +static inline void rq_list_move(struct request **src, struct request **dst, struct request *rq, struct request *prev) { if (prev) -- cgit v1.2.3 From 0ea9fc15b1d7d6636d429e74ffe3f86bf2f2f7d6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Nov 2021 17:05:07 +0100 Subject: fs/locks: fix fcntl_getlk64/fcntl_setlk64 stub prototypes My patch to rework oabi fcntl64() introduced a harmless sparse warning when file locking is disabled: arch/arm/kernel/sys_oabi-compat.c:251:51: sparse: sparse: incorrect type in argument 3 (different address spaces) @@ expected struct flock64 [noderef] __user *user @@ got struct flock64 * @@ arch/arm/kernel/sys_oabi-compat.c:251:51: sparse: expected struct flock64 [noderef] __user *user arch/arm/kernel/sys_oabi-compat.c:251:51: sparse: got struct flock64 * arch/arm/kernel/sys_oabi-compat.c:265:55: sparse: sparse: incorrect type in argument 4 (different address spaces) @@ expected struct flock64 [noderef] __user *user @@ got struct flock64 * @@ arch/arm/kernel/sys_oabi-compat.c:265:55: sparse: expected struct flock64 [noderef] __user *user arch/arm/kernel/sys_oabi-compat.c:265:55: sparse: got struct flock64 * When file locking 
is enabled, everything works correctly and the right data gets passed, but the stub declarations in linux/fs.h did not get modified when the calling conventions changed in an earlier patch. Reported-by: kernel test robot Fixes: 7e2d8c29ecdd ("ARM: 9111/1: oabi-compat: rework fcntl64() emulation") Fixes: a75d30c77207 ("fs/locks: pass kernel struct flock to fcntl_getlk/setlk") Cc: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Signed-off-by: Arnd Bergmann Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbf812ce89a8..5122d13775c2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1220,13 +1220,13 @@ static inline int fcntl_setlk(unsigned int fd, struct file *file, #if BITS_PER_LONG == 32 static inline int fcntl_getlk64(struct file *file, unsigned int cmd, - struct flock64 __user *user) + struct flock64 *user) { return -EINVAL; } static inline int fcntl_setlk64(unsigned int fd, struct file *file, - unsigned int cmd, struct flock64 __user *user) + unsigned int cmd, struct flock64 *user) { return -EACCES; } -- cgit v1.2.3 From 719774377622bc4025d2a74f551b5dc2158c6c30 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:22 +0100 Subject: netfilter: conntrack: convert to refcount_t api Convert nf_conn reference counting from atomic_t to refcount_t based api. refcount_t api provides more runtime sanity checks and will warn on certain constructs, e.g. refcount_inc() on a zero reference count, which usually indicates use-after-free. For this reason template allocation is changed to init the refcount to 1, the subsequent add operations are removed. Likewise, init_conntrack() is changed to set the initial refcount to 1 instead of refcount_inc(). This is safe because the new entry is not (yet) visible to other cpus.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 700ea077ce2d..a03f7a80b9ab 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -2,7 +2,7 @@ #ifndef _NF_CONNTRACK_COMMON_H #define _NF_CONNTRACK_COMMON_H -#include +#include #include struct ip_conntrack_stat { @@ -25,19 +25,19 @@ struct ip_conntrack_stat { #define NFCT_PTRMASK ~(NFCT_INFOMASK) struct nf_conntrack { - atomic_t use; + refcount_t use; }; void nf_conntrack_destroy(struct nf_conntrack *nfct); static inline void nf_conntrack_put(struct nf_conntrack *nfct) { - if (nfct && atomic_dec_and_test(&nfct->use)) + if (nfct && refcount_dec_and_test(&nfct->use)) nf_conntrack_destroy(nfct); } static inline void nf_conntrack_get(struct nf_conntrack *nfct) { if (nfct) - atomic_inc(&nfct->use); + refcount_inc(&nfct->use); } #endif /* _NF_CONNTRACK_COMMON_H */ -- cgit v1.2.3 From 3fce16493dc1aa2c9af3d7e7bd360dfe203a3e6a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:23 +0100 Subject: netfilter: core: move ip_ct_attach indirection to struct nf_ct_hook ip_ct_attach predates struct nf_ct_hook, we can place it there and remove the exported symbol. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 3fda1a508733..e0e3f3355ab1 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -440,7 +440,6 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include -extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu; void nf_ct_attach(struct sk_buff *, const struct sk_buff *); struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, @@ -463,6 +462,7 @@ struct nf_ct_hook { void (*destroy)(struct nf_conntrack *); bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); + void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); }; extern struct nf_ct_hook __rcu *nf_ct_hook; -- cgit v1.2.3 From 285c8a7a58158cb1805c97ff03875df2ba2ea1fe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:24 +0100 Subject: netfilter: make function op structures const No functional changes, these structures should be const. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index e0e3f3355ab1..15e71bfff726 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -381,13 +381,13 @@ struct nf_nat_hook { enum ip_conntrack_dir dir); }; -extern struct nf_nat_hook __rcu *nf_nat_hook; +extern const struct nf_nat_hook __rcu *nf_nat_hook; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { #if IS_ENABLED(CONFIG_NF_NAT) - struct nf_nat_hook *nat_hook; + const struct nf_nat_hook *nat_hook; rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); @@ -464,7 +464,7 @@ struct nf_ct_hook { const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); }; -extern struct nf_ct_hook __rcu *nf_ct_hook; +extern const struct nf_ct_hook __rcu *nf_ct_hook; struct nlattr; @@ -479,7 +479,7 @@ struct nfnl_ct_hook { void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); }; -extern struct nfnl_ct_hook __rcu *nfnl_ct_hook; +extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook; /** * nf_skb_duplicated - TEE target has sent a packet -- cgit v1.2.3 From 6ae7989c9af0d98ab64196f4f4c6f6499454bd23 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:25 +0100 Subject: netfilter: conntrack: avoid useless indirection during conntrack destruction nf_ct_put() results in a useless indirection: nf_ct_put -> nf_conntrack_put -> nf_conntrack_destroy -> rcu readlock + indirect call of ct_hooks->destroy(). There are two _put helpers: nf_ct_put and nf_conntrack_put. The latter is what should be used in code that MUST NOT cause a linker dependency on the conntrack module (e.g. calls from core network stack). Everyone else should call nf_ct_put() instead.
A followup patch will convert a few nf_conntrack_put() calls to nf_ct_put(), in particular from modules that already have a conntrack dependency such as act_ct or even nf_conntrack itself. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index a03f7a80b9ab..2770db2fa080 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -29,6 +29,8 @@ struct nf_conntrack { }; void nf_conntrack_destroy(struct nf_conntrack *nfct); + +/* like nf_ct_put, but without module dependency on nf_conntrack */ static inline void nf_conntrack_put(struct nf_conntrack *nfct) { if (nfct && refcount_dec_and_test(&nfct->use)) -- cgit v1.2.3 From 6316136ec6e3dd1c302f7e7289a9ee46ecc610ae Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 15:46:16 +0100 Subject: netfilter: egress: avoid a lockdep splat include/linux/netfilter_netdev.h:97 suspicious rcu_dereference_check() usage! 2 locks held by sd-resolve/1100: 0: ..(rcu_read_lock_bh){1:3}, at: ip_finish_output2 1: ..(rcu_read_lock_bh){1:3}, at: __dev_queue_xmit __dev_queue_xmit+0 .. The helper has two callers, one uses rcu_read_lock, the other rcu_read_lock_bh(). Annotate the dereference to reflect this. 
Fixes: 42df6e1d221dd ("netfilter: Introduce egress hook") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_netdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index b71b57a83bb4..b4dd96e4dc8d 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -94,7 +94,7 @@ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, return skb; #endif - e = rcu_dereference(dev->nf_hooks_egress); + e = rcu_dereference_check(dev->nf_hooks_egress, rcu_read_lock_bh_held()); if (!e) return skb; -- cgit v1.2.3 From 6f022c2ddbcefaee79502ce5386dfe351d457070 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 6 Jan 2022 17:38:04 +0200 Subject: net: openvswitch: Fix ct_state nat flags for conns arriving from tc Netfilter conntrack maintains NAT flags per connection indicating whether NAT was configured for the connection. Openvswitch maintains NAT flags on the per packet flow key ct_state field, indicating whether NAT was actually executed on the packet. When a packet misses from tc to ovs the conntrack NAT flags are set. However, NAT was not necessarily executed on the packet because the connection's state might still be in NEW state. As such, openvswitch wrongly assumes that NAT was executed and sets an incorrect flow key NAT flags. Fix this, by flagging to openvswitch which NAT was actually done in act_ct via tc_skb_ext and tc_skb_cb to the openvswitch module, so the packet flow key NAT flags will be correctly set. 
Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct") Signed-off-by: Paul Blakey Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20220106153804.26451-1-paulb@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4507d77d6941..60ab0c2fe567 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -287,7 +287,9 @@ struct tc_skb_ext { __u32 chain; __u16 mru; __u16 zone; - bool post_ct; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; }; #endif -- cgit v1.2.3 From c504e5c2f9648a1e5c2be01e8c3f59d394192bd3 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 9 Jan 2022 14:36:26 +0800 Subject: net: skb: introduce kfree_skb_reason() Introduce the interface kfree_skb_reason(), which is able to pass the reason why the skb is dropped to 'kfree_skb' tracepoint. Add the 'reason' field to 'trace_kfree_skb', so that users can get more detailed information about abnormal skb with 'drop_monitor' or eBPF. All drop reasons are defined in the enum 'skb_drop_reason', and they will be printed as strings in 'kfree_skb' tracepoint in format of 'reason: XXX'. ( Maybe the reasons should be defined in a uapi header file, so that user space can use them? ) Signed-off-by: Menglong Dong Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 642acb0d1646..ef0870abc791 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -305,6 +305,17 @@ struct sk_buff_head { struct sk_buff; +/* The reason of skb drop, which is used in kfree_skb_reason(). + * en...maybe they should be splited by group? + * + * Each item here should also be in 'TRACE_SKB_DROP_REASON', which is + * used to translate the reason to string.
+ */ +enum skb_drop_reason { + SKB_DROP_REASON_NOT_SPECIFIED, + SKB_DROP_REASON_MAX, +}; + /* To allow 64K frame to be packed as single skb without frag_list we * require 64K/PAGE_SIZE pages plus 1 additional page to allow for * buffers which do not start on a page boundary. @@ -1085,8 +1096,18 @@ static inline bool skb_unref(struct sk_buff *skb) return true; } +void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); + +/** + * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason + * @skb: buffer to free + */ +static inline void kfree_skb(struct sk_buff *skb) +{ + kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); +} + void skb_release_head_state(struct sk_buff *skb); -void kfree_skb(struct sk_buff *skb); void kfree_skb_list(struct sk_buff *segs); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); -- cgit v1.2.3 From 85125597419aec3aa7b8f3b8713e415f997796f2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 9 Jan 2022 14:36:27 +0800 Subject: net: skb: use kfree_skb_reason() in tcp_v4_rcv() Replace kfree_skb() with kfree_skb_reason() in tcp_v4_rcv(). Following drop reasons are added: SKB_DROP_REASON_NO_SOCKET SKB_DROP_REASON_PKT_TOO_SMALL SKB_DROP_REASON_TCP_CSUM SKB_DROP_REASON_TCP_FILTER After this patch, 'kfree_skb' event will print message like this: $ TASK-PID CPU# ||||| TIMESTAMP FUNCTION $ | | | ||||| | | -0 [000] ..s1. 36.113438: kfree_skb: skbaddr=(____ptrval____) protocol=2048 location=(____ptrval____) reason: NO_SOCKET The reason of skb drop is printed too. 
Signed-off-by: Menglong Dong Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ef0870abc791..c9c97b0d0fe9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -313,6 +313,10 @@ struct sk_buff; */ enum skb_drop_reason { SKB_DROP_REASON_NOT_SPECIFIED, + SKB_DROP_REASON_NO_SOCKET, + SKB_DROP_REASON_PKT_TOO_SMALL, + SKB_DROP_REASON_TCP_CSUM, + SKB_DROP_REASON_TCP_FILTER, SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From 1c7fab70df085d866a3765955f397ca2b4025b15 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 9 Jan 2022 14:36:28 +0800 Subject: net: skb: use kfree_skb_reason() in __udp4_lib_rcv() Replace kfree_skb() with kfree_skb_reason() in __udp4_lib_rcv. New drop reason 'SKB_DROP_REASON_UDP_CSUM' is added for udp csum error. Signed-off-by: Menglong Dong Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c9c97b0d0fe9..af64c7de9b53 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -317,6 +317,7 @@ enum skb_drop_reason { SKB_DROP_REASON_PKT_TOO_SMALL, SKB_DROP_REASON_TCP_CSUM, SKB_DROP_REASON_TCP_FILTER, + SKB_DROP_REASON_UDP_CSUM, SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From a6b5a28eb56c3f4988f7ff5290b954ba296e309a Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Sat, 14 Nov 2020 13:43:54 -0500 Subject: nfs: Convert to new fscache volume/cookie API Change the nfs filesystem to support fscache's indexing rewrite and reenable caching in nfs. The following changes have been made: (1) The fscache_netfs struct is no more, and there's no need to register the filesystem as a whole. (2) The session cookie is now an fscache_volume cookie, allocated with fscache_acquire_volume(). 
That takes three parameters: a string representing the "volume" in the index, a string naming the cache to use (or NULL) and a u64 that conveys coherency metadata for the volume. For nfs, I've made it render the volume name string as: "nfs,<ver>,<family>,<address>,<port>,<fsidH>,<fsidL>*<,param>[,<uniq>]" (3) The fscache_cookie_def is no more and needed information is passed directly to fscache_acquire_cookie(). The cache no longer calls back into the filesystem, but rather metadata changes are indicated at other times. fscache_acquire_cookie() is passed the same keying and coherency information as before. (4) fscache_enable/disable_cookie() have been removed. Call fscache_use_cookie() and fscache_unuse_cookie() when a file is opened or closed to prevent a cache file from being culled and to keep resources to hand that are needed to do I/O. If a file is opened for writing, we invalidate it with FSCACHE_INVAL_DIO_WRITE in lieu of doing writeback to the cache, thereby making it cease caching until all currently open files are closed. This should give the same behaviour as the upstream code. Making the cache store local modifications isn't straightforward for NFS, so that's left for future patches. (5) fscache_invalidate() now needs to be given up-to-date auxiliary data and a file size. It also takes a flag to indicate if this was due to a DIO write. (6) Call nfs_fscache_invalidate() with FSCACHE_INVAL_DIO_WRITE on a file to which a DIO write is made. (7) Call fscache_note_page_release() from nfs_release_page(). (8) Use a killable wait in nfs_vm_page_mkwrite() when waiting for PG_fscache to be cleared. (9) The functions to read and write data to/from the cache are stubbed out pending a conversion to use netfslib. Changes ======= ver #3: - Added missing =n fallback for nfs_fscache_release_file()[1][2]. ver #2: - Use gfpflags_allow_blocking() rather than using flag directly. - fscache_acquire_volume() now returns errors. - Remove NFS_INO_FSCACHE as it's no longer used. - Need to unuse a cookie on file-release, not inode-clear.
Signed-off-by: Dave Wysochanski Co-developed-by: David Howells Signed-off-by: David Howells Tested-by: Dave Wysochanski Acked-by: Jeff Layton cc: Trond Myklebust cc: Anna Schumaker cc: linux-nfs@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/202112100804.nksO8K4u-lkp@intel.com/ [1] Link: https://lore.kernel.org/r/202112100957.2oEDT20W-lkp@intel.com/ [2] Link: https://lore.kernel.org/r/163819668938.215744.14448852181937731615.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906979003.143852.2601189243864854724.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967182112.1823006.7791504655391213379.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021575950.640689.12069642327533368467.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/nfs_fs.h | 1 - include/linux/nfs_fs_sb.h | 9 ++------- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 05f249f20f55..00835bacd236 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -275,7 +275,6 @@ struct nfs4_copy_state { #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ -#define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ #define NFS_INO_FORCE_READDIR (7) /* force readdirplus */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 2a9acbfe00f0..77b2dba27bbb 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -120,11 +120,6 @@ struct nfs_client { * This is used to generate the mv0 callback address. 
*/ char cl_ipaddr[48]; - -#ifdef CONFIG_NFS_FSCACHE - struct fscache_cookie *fscache; /* client index cache cookie */ -#endif - struct net *cl_net; struct list_head pending_cb_stateids; }; @@ -194,8 +189,8 @@ struct nfs_server { struct nfs_auth_info auth_info; /* parsed auth flavors */ #ifdef CONFIG_NFS_FSCACHE - struct nfs_fscache_key *fscache_key; /* unique key for superblock */ - struct fscache_cookie *fscache; /* superblock cookie */ + struct fscache_volume *fscache; /* superblock cookie */ + char *fscache_uniq; /* Uniquifier (or NULL) */ #endif u32 pnfs_blksize; /* layout_blksize attr */ -- cgit v1.2.3 From 16f2f4e679cfdaa9552574484f104014908a76c6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 27 Aug 2021 15:19:34 +0100 Subject: nfs: Implement cache I/O by accessing the cache directly Move NFS to using fscache DIO API instead of the old upstream I/O API as that has been removed. This is a stopgap solution as the intention is that at some time in the future, the cache will move to using larger blocks and won't be able to store individual pages in order to deal with the potential for data corruption due to the backing filesystem being able to insert/remove bridging blocks of zeros into its extent list[1]. NFS then reads and writes cache pages synchronously and one page at a time. The preferred change would be to use the netfs lib, but the new I/O API can be used directly. It's just that as the cache now needs to track data for itself, caching blocks may exceed page size... This code is somewhat borrowed from my "fallback I/O" patchset[2]. Changes ======= ver #3: - Restore lost =n fallback for nfs_fscache_release_page()[2].
Signed-off-by: David Howells Tested-by: Dave Wysochanski Acked-by: Jeff Layton cc: Trond Myklebust cc: Anna Schumaker cc: linux-nfs@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/YO17ZNOcq+9PajfQ@mit.edu [1] Link: https://lore.kernel.org/r/202112100957.2oEDT20W-lkp@intel.com/ [2] Link: https://lore.kernel.org/r/163189108292.2509237.12615909591150927232.stgit@warthog.procyon.org.uk/ [2] Link: https://lore.kernel.org/r/163906981318.143852.17220018647843475985.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967184451.1823006.6450645559828329590.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021577632.640689.11069627070150063812.stgit@warthog.procyon.org.uk/ # v4 --- include/linux/fscache.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 7bd35f60d19a..ede50406bcb0 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -168,6 +168,7 @@ extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool); extern void __fscache_resize_cookie(struct fscache_cookie *, loff_t); extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t, unsigned int); extern int __fscache_begin_read_operation(struct netfs_cache_resources *, struct fscache_cookie *); +extern int __fscache_begin_write_operation(struct netfs_cache_resources *, struct fscache_cookie *); extern void __fscache_write_to_cache(struct fscache_cookie *, struct address_space *, loff_t, size_t, loff_t, netfs_io_terminated_t, void *, @@ -499,6 +500,33 @@ int fscache_read(struct netfs_cache_resources *cres, term_func, term_func_priv); } +/** + * fscache_begin_write_operation - Begin a write operation for the netfs lib + * @cres: The cache resources for the write being performed + * @cookie: The cookie representing the cache object + * + * Begin a write operation on behalf of the netfs helper 
library. @cres + * indicates the cache resources to which the operation state should be + * attached; @cookie indicates the cache object that will be accessed. + * + * @cres->inval_counter is set from @cookie->inval_counter for comparison at + * the end of the operation. This allows invalidation during the operation to + * be detected by the caller. + * + * Returns: + * * 0 - Success + * * -ENOBUFS - No caching available + * * Other error code from the cache, such as -ENOMEM. + */ +static inline +int fscache_begin_write_operation(struct netfs_cache_resources *cres, + struct fscache_cookie *cookie) +{ + if (fscache_cookie_enabled(cookie)) + return __fscache_begin_write_operation(cres, cookie); + return -ENOBUFS; +} + /** * fscache_write - Start a write to the cache. * @cres: The cache resources to use -- cgit v1.2.3 From a59466ee91aaa9d43889a4c51e01de087d188448 Mon Sep 17 00:00:00 2001 From: Karolina Drobnik Date: Tue, 11 Jan 2022 10:28:47 +0000 Subject: memblock: Remove #ifdef __KERNEL__ from memblock.h memblock.h is not a uAPI header, so __KERNEL__ guard can be deleted Signed-off-by: Karolina Drobnik Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/20220111102847.673746-1-karolinadrobnik@gmail.com --- include/linux/memblock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 9dc7cb239d21..50ad19662a32 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _LINUX_MEMBLOCK_H #define _LINUX_MEMBLOCK_H -#ifdef __KERNEL__ /* * Logical memory blocks. 
@@ -605,6 +604,5 @@ static inline void early_memtest(phys_addr_t start, phys_addr_t end) } #endif -#endif /* __KERNEL__ */ #endif /* _LINUX_MEMBLOCK_H */ -- cgit v1.2.3 From 500b55b05d0a21c4adddf4c3b29ee6f32b502046 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 21 Dec 2021 10:45:07 -0600 Subject: PCI: Work around Intel I210 ROM BAR overlap defect Per PCIe r5, sec 7.5.1.2.4, a device must not claim accesses to its Expansion ROM unless both the Memory Space Enable and the Expansion ROM Enable bit are set. But apparently some Intel I210 NICs don't work correctly if the ROM BAR overlaps another BAR, even if the Expansion ROM is disabled. Michael reported that on a Kontron SMARC-sAL28 ARM64 system with U-Boot v2021.01-rc3, the ROM BAR overlaps BAR 3, and networking doesn't work at all: BAR 0: 0x40000000 (32-bit, non-prefetchable) [size=1M] BAR 3: 0x40200000 (32-bit, non-prefetchable) [size=16K] ROM: 0x40200000 (disabled) [size=1M] NETDEV WATCHDOG: enP2p1s0 (igb): transmit queue 0 timed out Hardware name: Kontron SMARC-sAL28 (Single PHY) on SMARC Eval 2.0 carrier (DT) igb 0002:01:00.0 enP2p1s0: Reset adapter Previously, pci_std_update_resource() wrote the assigned ROM address to the BAR only when the ROM was enabled. This meant that the I210 ROM BAR could be left with an address assigned by firmware, which might overlap with other BARs. Quirk these I210 devices so pci_std_update_resource() always writes the assigned address to the ROM BAR, whether or not the ROM is enabled. 
Link: https://lore.kernel.org/r/20211223163754.GA1267351@bhelgaas Link: https://lore.kernel.org/r/20201230185317.30915-1-michael@walle.cc Link: https://bugzilla.kernel.org/show_bug.cgi?id=211105 Reported-by: Michael Walle Tested-by: Michael Walle Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 18a75c8e615c..51c4a063f489 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -455,6 +455,7 @@ struct pci_dev { unsigned int link_active_reporting:1;/* Device capable of reporting link active */ unsigned int no_vf_scan:1; /* Don't scan for VFs after IOV enablement */ unsigned int no_command_memory:1; /* No PCI_COMMAND_MEMORY */ + unsigned int rom_bar_overlap:1; /* ROM BAR disable broken */ pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ -- cgit v1.2.3 From 1d3cfc2835c1754d19a743dc346a9e58cf0c07c0 Mon Sep 17 00:00:00 2001 From: Kelvin Cao Date: Thu, 23 Dec 2021 17:23:33 -0800 Subject: ntb_hw_switchtec: Remove code for disabling ID protection ID protection is a firmware setting for NT window access control. With it enabled, only the posted requests with requester IDs in the requester ID table will be allowed to access the NT windows. Otherwise all posted requests are allowed. Normally user will configure it statically via the Switchtec config file, and it will take effect when the firmware boots up. The driver can also toggle the ID protection setting dynamically, which will overwrite the static setting in the Switchtec config file as a side effect. Currently, the driver disables the ID protection. However, it's not necessary to disable the ID protection at the driver level as the driver has already configured the proper requester IDs in the requester ID table to allow the corresponding posted requests to hit the NT windows. 
Remove the code that disables the ID protection to make the static setting prevail. Note: ID protection is not applicable to non-posted requests. Signed-off-by: Kelvin Cao Signed-off-by: Jon Mason --- include/linux/switchtec.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index be24056ac00f..48fabe36509e 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -337,8 +337,6 @@ enum { NTB_CTRL_REQ_ID_EN = 1 << 0, NTB_CTRL_LUT_EN = 1 << 0, - - NTB_PART_CTRL_ID_PROT_DIS = 1 << 0, }; struct ntb_ctrl_regs { -- cgit v1.2.3 From ca321ec74322e3c49552fc1ffc80b42d0dbf1a84 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 8 Jan 2022 15:06:57 +0100 Subject: module.h: allow #define strings to work with MODULE_IMPORT_NS The MODULE_IMPORT_NS() macro does not allow defined strings to work properly with it, so add a layer of indirection to allow this to happen. Cc: Luis Chamberlain Cc: Jessica Yu Cc: Matthias Maennich Signed-off-by: Greg Kroah-Hartman Reviewed-by: Matthias Maennich Signed-off-by: Luis Chamberlain --- include/linux/module.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index c9f1200b2312..f4338235ed2c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -290,7 +290,8 @@ extern typeof(name) __mod_##type##__##name##_device_table \ * files require multiple MODULE_FIRMWARE() specifiers */ #define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware) -#define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, #ns) +#define _MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, #ns) +#define MODULE_IMPORT_NS(ns) _MODULE_IMPORT_NS(ns) struct notifier_block; -- cgit v1.2.3 From 3f4b32511a77bc5a05cfbf26fec94c4e1b1cf46a Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 7 Jan 2022 18:17:18 +0000 Subject: PM: core: Remove DEFINE_UNIVERSAL_DEV_PM_OPS() macro 
The deprecated UNIVERSAL_DEV_PM_OPS() macro uses the provided callbacks for both runtime PM and system sleep, which is very likely to be a mistake, as a system sleep can be triggered while a given device is already PM-suspended, which would cause the suspend callback to be called twice. The number of users of UNIVERSAL_DEV_PM_OPS() is also tiny (16 occurrences) compared to the number of places where SET_SYSTEM_SLEEP_PM_OPS() is used with pm_runtime_force_suspend() and pm_runtime_force_resume(), which makes me think that none of these cases are actually valid. As the new macro DEFINE_UNIVERSAL_DEV_PM_OPS() which was introduced to replace UNIVERSAL_DEV_PM_OPS() is currently unused, remove it before someone starts to use it in yet another invalid case. Signed-off-by: Paul Cercueil Acked-by: Jonathan Cameron Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index e1e9402180b9..02f059d814bb 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -366,6 +366,12 @@ static const struct dev_pm_ops name = { \ SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ } +/* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. */ +#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ +const struct dev_pm_ops __maybe_unused name = { \ + SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ +} + /* * Use this for defining a set of PM operations to be used in all situations * (system suspend, hibernation or runtime PM). @@ -378,20 +384,9 @@ static const struct dev_pm_ops name = { \ * suspend and "early" resume callback pointers, .suspend_late() and * .resume_early(), to the same routines as .runtime_suspend() and * .runtime_resume(), respectively (and analogously for hibernation). + * + * Deprecated. You most likely don't want this macro.
*/ -#define DEFINE_UNIVERSAL_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ -static const struct dev_pm_ops name = { \ - SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ - RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ -} - -/* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. */ -#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ -const struct dev_pm_ops __maybe_unused name = { \ - SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ -} - -/* Deprecated. Use DEFINE_UNIVERSAL_DEV_PM_OPS() instead. */ #define UNIVERSAL_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ const struct dev_pm_ops __maybe_unused name = { \ SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ -- cgit v1.2.3 From 52cc1d7f9786d2be44a3ab9b5b48416a7618e713 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 7 Jan 2022 18:17:19 +0000 Subject: PM: core: Remove static qualifier in DEFINE_SIMPLE_DEV_PM_OPS macro Keep this macro in line with the other ones. This makes it possible to use them in the cases where the underlying dev_pm_ops structure is exported. Restore the "static" qualifier in the two drivers where the DEFINE_SIMPLE_DEV_PM_OPS macro was used. Signed-off-by: Paul Cercueil Acked-by: Jonathan Cameron Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 02f059d814bb..8e13387e70ec 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -362,7 +362,7 @@ struct dev_pm_ops { * to RAM and hibernation. 
*/ #define DEFINE_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ -static const struct dev_pm_ops name = { \ +const struct dev_pm_ops name = { \ SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ } -- cgit v1.2.3 From 0ae101fdd3297b7165755340e05386f1e1379709 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 7 Jan 2022 18:17:20 +0000 Subject: PM: core: Add EXPORT[_GPL]_SIMPLE_DEV_PM_OPS macros These macros are defined conditionally, according to CONFIG_PM: - if CONFIG_PM is enabled, these macros resolve to DEFINE_SIMPLE_DEV_PM_OPS(), and the dev_pm_ops symbol will be exported. - if CONFIG_PM is disabled, these macros will result in a dummy static dev_pm_ops to be created with the __maybe_unused flag. The dev_pm_ops will then be discarded by the compiler, along with the provided callback functions if they are not used anywhere else. In the second case, the symbol is not exported, which should be perfectly fine - users of the symbol should all use the pm_ptr() or pm_sleep_ptr() macro, so the dev_pm_ops marked as "extern" in the client's code will never be accessed. Signed-off-by: Paul Cercueil Acked-by: Jonathan Cameron Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 8e13387e70ec..8279af2c538a 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -8,6 +8,7 @@ #ifndef _LINUX_PM_H #define _LINUX_PM_H +#include #include #include #include @@ -357,14 +358,42 @@ struct dev_pm_ops { #define SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) #endif +#define _DEFINE_DEV_PM_OPS(name, \ + suspend_fn, resume_fn, \ + runtime_suspend_fn, runtime_resume_fn, idle_fn) \ +const struct dev_pm_ops name = { \ + SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ + RUNTIME_PM_OPS(runtime_suspend_fn, runtime_resume_fn, idle_fn) \ +} + +#ifdef CONFIG_PM +#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ + runtime_resume_fn, idle_fn, sec) \ + _DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ + runtime_resume_fn, idle_fn); \ + _EXPORT_SYMBOL(name, sec) +#else +#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ + runtime_resume_fn, idle_fn, sec) \ +static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \ + resume_fn, runtime_suspend_fn, \ + runtime_resume_fn, idle_fn) +#endif + /* * Use this if you want to use the same suspend and resume callbacks for suspend * to RAM and hibernation. + * + * If the underlying dev_pm_ops struct symbol has to be exported, use + * EXPORT_SIMPLE_DEV_PM_OPS() or EXPORT_GPL_SIMPLE_DEV_PM_OPS() instead. 
*/ #define DEFINE_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ -const struct dev_pm_ops name = { \ - SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \ -} + _DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL) + +#define EXPORT_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ + _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "") +#define EXPORT_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ + _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl") /* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. */ #define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ -- cgit v1.2.3 From 9d8619190031af0a314bee865262d8975473e4dd Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 7 Jan 2022 18:17:21 +0000 Subject: PM: runtime: Add DEFINE_RUNTIME_DEV_PM_OPS() macro A lot of drivers create a dev_pm_ops struct with the system sleep suspend/resume callbacks set to pm_runtime_force_suspend() and pm_runtime_force_resume(). These drivers can now use the DEFINE_RUNTIME_DEV_PM_OPS() macro, which will use pm_runtime_force_{suspend,resume}() as the system sleep callbacks, while having the same dead code removal characteristic that is already provided by DEFINE_SIMPLE_DEV_PM_OPS(). Signed-off-by: Paul Cercueil Acked-by: Jonathan Cameron Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 3 ++- include/linux/pm_runtime.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 8279af2c538a..f7d2be686359 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -414,7 +414,8 @@ const struct dev_pm_ops __maybe_unused name = { \ * .resume_early(), to the same routines as .runtime_suspend() and * .runtime_resume(), respectively (and analogously for hibernation). * - * Deprecated. You most likely don't want this macro. + * Deprecated. You most likely don't want this macro. 
Use + * DEFINE_RUNTIME_DEV_PM_OPS() instead. */ #define UNIVERSAL_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ const struct dev_pm_ops __maybe_unused name = { \ diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 016de5776b6d..4af454d29281 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -22,6 +22,20 @@ usage_count */ #define RPM_AUTO 0x08 /* Use autosuspend_delay */ +/* + * Use this for defining a set of PM operations to be used in all situations + * (system suspend, hibernation or runtime PM). + * + * Note that the behaviour differs from the deprecated UNIVERSAL_DEV_PM_OPS() + * macro, which uses the provided callbacks for both runtime PM and system + * sleep, while DEFINE_RUNTIME_DEV_PM_OPS() uses pm_runtime_force_suspend() + * and pm_runtime_force_resume() for its system sleep callbacks. + */ +#define DEFINE_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ + _DEFINE_DEV_PM_OPS(name, pm_runtime_force_suspend, \ + pm_runtime_force_resume, suspend_fn, \ + resume_fn, idle_fn) + #ifdef CONFIG_PM extern struct workqueue_struct *pm_wq; -- cgit v1.2.3 From d59ff7d9d84b03d22c5107f794e28fc8e1fce3a6 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 7 Jan 2022 18:17:22 +0000 Subject: PM: runtime: Add EXPORT[_GPL]_RUNTIME_DEV_PM_OPS macros Similar to EXPORT[_GPL]_SIMPLE_DEV_PM_OPS, but for users with runtime-PM suspend/resume callbacks. Signed-off-by: Paul Cercueil Acked-by: Jonathan Cameron Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm_runtime.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 4af454d29281..9f09601c465a 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -30,12 +30,22 @@ * macro, which uses the provided callbacks for both runtime PM and system * sleep, while DEFINE_RUNTIME_DEV_PM_OPS() uses pm_runtime_force_suspend() * and pm_runtime_force_resume() for its system sleep callbacks. + * + * If the underlying dev_pm_ops struct symbol has to be exported, use + * EXPORT_RUNTIME_DEV_PM_OPS() or EXPORT_GPL_RUNTIME_DEV_PM_OPS() instead. */ #define DEFINE_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ _DEFINE_DEV_PM_OPS(name, pm_runtime_force_suspend, \ pm_runtime_force_resume, suspend_fn, \ resume_fn, idle_fn) +#define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ + _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ + suspend_fn, resume_fn, idle_fn, "") +#define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ + _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ + suspend_fn, resume_fn, idle_fn, "_gpl") + #ifdef CONFIG_PM extern struct workqueue_struct *pm_wq; -- cgit v1.2.3 From 2d7c86a8f9cdce1408c4f3c69d94d007eff2f179 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:50 +0530 Subject: libceph: generalize addr/ip parsing based on delimiter ... and remove hardcoded function name in ceph_parse_ips(). 
[ idryomov: delim parameter, drop CEPH_ADDR_PARSE_DEFAULT_DELIM ] Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 2 +- include/linux/ceph/messenger.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 409d8c29bc4f..c72285d8594e 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -301,7 +301,7 @@ struct fs_parameter; struct fc_log; struct ceph_options *ceph_alloc_options(void); int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, - struct fc_log *l); + struct fc_log *l, char delim); int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, struct fc_log *l); int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 0e6e9ad3c3bf..ff99ce094cfa 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -532,7 +532,7 @@ extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, - int max_count, int *count); + int max_count, int *count, char delim); extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); -- cgit v1.2.3 From 4153c7fc937a2afa077dbdb9fe3189b9981f423c Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:51 +0530 Subject: libceph: rename parse_fsid() to ceph_parse_fsid() and export ... as it is too generic. also, use __func__ when logging rather than hardcoding the function name. 
Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index c72285d8594e..644f224eccf7 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -296,6 +296,7 @@ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); extern void *ceph_kvmalloc(size_t size, gfp_t flags); +extern int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid); struct fs_parameter; struct fc_log; -- cgit v1.2.3 From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Thu, 13 Jan 2022 10:55:36 +0800 Subject: blk-mq: fix tag_get wait task can't be awakened In case of shared tags, there might be more than one hctx which allocates from the same tags, and each hctx is limited to allocate at most: hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U); tag idle detection is lazy, and may be delayed for 30sec, so there could be just one real active hctx(queue) but all others are actually idle and still accounted as active because of the lazy idle detection. Then if wake_batch is > hctx_max_depth, driver tag allocation may wait forever on this real active hctx. Fix this by recalculating wake_batch when inc or dec active_queues. 
Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps") Suggested-by: Ming Lei Suggested-by: John Garry Signed-off-by: Laibin Qiu Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index fc0357a6e19b..95df357ec009 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) sbitmap_free(&sbq->sb); } +/** + * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch + * @sbq: Bitmap queue to recalculate wake batch. + * @users: Number of shares. + * + * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch + * by depth. This interface is for HCTX shared tags or queue shared tags. + */ +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, + unsigned int users); + /** * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. * @sbq: Bitmap queue to resize. -- cgit v1.2.3 From 289e7b0f7eb47b87a0441e6c81336316f301eb39 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 13 Dec 2021 11:08:53 +0100 Subject: tracing: Account bottom half disabled sections. Disabling only bottom halves via local_bh_disable() also disables preemption, but this remains invisible to tracing. On a CONFIG_PREEMPT kernel one might wonder why there is no scheduling happening despite the N flag in the trace. The reason might be an rcu_read_lock_bh() section. Add a 'b' to the tracing output if in task context with disabled bottom halves.
Link: https://lkml.kernel.org/r/YbcbtdtC/bjCKo57@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 3900404aa063..70c069aef02c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -172,6 +172,7 @@ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, TRACE_FLAG_NMI = 0x40, + TRACE_FLAG_BH_OFF = 0x80, }; #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT -- cgit v1.2.3 From 6840f9094f2bd788a316d8cb0a4e42538d3e47dd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jan 2022 16:44:19 -0500 Subject: pagevec: Initialise folio_batch->percpu_pvec_drained When UBSAN is enabled, it reports an invalid value in __pagevec_release() when accessing pvec->percpu_pvec_drained, which is simply whatever garbage was on the stack. Initialise it when initialising the rest of the folio_batch. 
Fixes: 10331795fb79 ("pagevec: Add folio_batch") Reported-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/pagevec.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index dda8d5868c81..67b1246f136b 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -111,6 +111,7 @@ static_assert(offsetof(struct pagevec, pages) == static inline void folio_batch_init(struct folio_batch *fbatch) { fbatch->nr = 0; + fbatch->percpu_pvec_drained = false; } static inline unsigned int folio_batch_count(struct folio_batch *fbatch) -- cgit v1.2.3 From e5b48ee30aec1fe6dff05e36b22e886c665b4736 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 4 Jan 2022 16:14:46 +0900 Subject: ata: sata_fsl: fix scsi host initialization When compiling with W=1, the sata_fsl driver compilation throws the warning: drivers/ata/sata_fsl.c:1385:22: error: initialized field overwritten [-Werror=override-init] 1385 | .can_queue = SATA_FSL_QUEUE_DEPTH, This is due to the driver scsi host template initialization overwriting the can_queue field that is already set using the ATA_NCQ_SHT() initializer macro, resulting in the same field being initialized twice in the host template declaration. To remove this warning, introduce the ATA_SUBBASE_SHT_QD() and ATA_NCQ_SHT_QD() initialization macros to allow specifying a queue depth different from the default ATA_DEF_QUEUE using an additional argument to the macro. 
Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke --- include/linux/libata.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index c258f69106f4..2e5e7c40c991 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1385,6 +1385,12 @@ extern const struct attribute_group *ata_common_sdev_groups[]; .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ .slave_configure = ata_scsi_slave_config +#define ATA_SUBBASE_SHT_QD(drv_name, drv_qd) \ + __ATA_BASE_SHT(drv_name), \ + .can_queue = drv_qd, \ + .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ + .slave_configure = ata_scsi_slave_config + #define ATA_BASE_SHT(drv_name) \ ATA_SUBBASE_SHT(drv_name), \ .sdev_groups = ata_common_sdev_groups @@ -1396,6 +1402,11 @@ extern const struct attribute_group *ata_ncq_sdev_groups[]; ATA_SUBBASE_SHT(drv_name), \ .sdev_groups = ata_ncq_sdev_groups, \ .change_queue_depth = ata_scsi_change_queue_depth + +#define ATA_NCQ_SHT_QD(drv_name, drv_qd) \ + ATA_SUBBASE_SHT_QD(drv_name, drv_qd), \ + .sdev_groups = ata_ncq_sdev_groups, \ + .change_queue_depth = ata_scsi_change_queue_depth #endif /* -- cgit v1.2.3 From 0561e514c944da874ccdfbe2922f71b4c333c7e1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 4 Jan 2022 17:54:18 +0900 Subject: ata: fix read_id() ata port operation interface Drivers that need to tweak a device IDENTIFY data implement the read_id() port operation. The IDENTIFY data buffer is passed as an argument to the read_id() operation for drivers to use. However, when this operation is called, the IDENTIFY data is not yet converted to CPU endian and contains le16 words. Change the interface of the read_id operation to pass a __le16 * pointer to the IDENTIFY data buffer to clarify the buffer endianness. 
Fix the pata_netcell, pata_it821x, ahci_xgene, ahci_ceva and ahci_brcm drivers' implementations of this operation and modify the code to correctly deal with identify data word manipulation to avoid sparse warnings such as: drivers/ata/ahci_xgene.c:262:33: warning: invalid assignment: &= drivers/ata/ahci_xgene.c:262:33: left side has type unsigned short drivers/ata/ahci_xgene.c:262:33: right side has type restricted __le16 Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke --- include/linux/libata.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 2e5e7c40c991..bf706cd45674 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -884,7 +884,8 @@ struct ata_port_operations { void (*set_piomode)(struct ata_port *ap, struct ata_device *dev); void (*set_dmamode)(struct ata_port *ap, struct ata_device *dev); int (*set_mode)(struct ata_link *link, struct ata_device **r_failed_dev); - unsigned int (*read_id)(struct ata_device *dev, struct ata_taskfile *tf, u16 *id); + unsigned int (*read_id)(struct ata_device *dev, struct ata_taskfile *tf, + __le16 *id); void (*dev_config)(struct ata_device *dev); @@ -1119,7 +1120,7 @@ extern void ata_id_string(const u16 *id, unsigned char *s, extern void ata_id_c_string(const u16 *id, unsigned char *s, unsigned int ofs, unsigned int len); extern unsigned int ata_do_dev_read_id(struct ata_device *dev, - struct ata_taskfile *tf, u16 *id); + struct ata_taskfile *tf, __le16 *id); extern void ata_qc_complete(struct ata_queued_cmd *qc); extern u64 ata_qc_get_active(struct ata_port *ap); extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd); -- cgit v1.2.3 From b9ba367c513dbc165dd6c01266a59db4be2a3564 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Wed, 5 Jan 2022 16:36:16 +0100 Subject: ata: libata: Rename link flag ATA_LFLAG_NO_DB_DELAY Rename the link flag ATA_LFLAG_NO_DB_DELAY to
ATA_LFLAG_NO_DEBOUNCE_DELAY. The new name is longer, but clearer. Signed-off-by: Paul Menzel Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index bf706cd45674..605756f645be 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -143,7 +143,7 @@ enum { ATA_LFLAG_NO_LPM = (1 << 8), /* disable LPM on this link */ ATA_LFLAG_RST_ONCE = (1 << 9), /* limit recovery to one reset */ ATA_LFLAG_CHANGED = (1 << 10), /* LPM state changed on this link */ - ATA_LFLAG_NO_DB_DELAY = (1 << 11), /* no debounce delay on link resume */ + ATA_LFLAG_NO_DEBOUNCE_DELAY = (1 << 11), /* no debounce delay on link resume */ /* struct ata_port flags */ ATA_FLAG_SLAVE_POSS = (1 << 0), /* host supports slave dev */ -- cgit v1.2.3 From d9679d0013a66849f23057978f92e76b255c50aa Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 13 Oct 2021 06:55:44 -0400 Subject: virtio: wrap config->reset calls This will enable cleanups down the road. The idea is to disable cbs, then add "flush_queued_cbs" callback as a parameter, this way drivers can flush any work queued after callbacks have been disabled. Signed-off-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20211013105226.20225-1-mst@redhat.com Signed-off-by: Michael S. 
Tsirkin --- include/linux/virtio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 41edbc01ffa4..72292a62cd90 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -138,6 +138,7 @@ int virtio_finalize_features(struct virtio_device *dev); int virtio_device_freeze(struct virtio_device *dev); int virtio_device_restore(struct virtio_device *dev); #endif +void virtio_reset_device(struct virtio_device *dev); size_t virtio_max_dma_size(struct virtio_device *vdev); -- cgit v1.2.3 From 539fec78edb4e084e7c532affc56cc42d4ceea4b Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 26 Nov 2021 17:47:53 +0100 Subject: vdpa: add driver_override support `driver_override` allows to control which of the vDPA bus drivers binds to a vDPA device. If `driver_override` is not set, the previous behaviour is followed: devices use the first vDPA bus driver loaded (unless auto binding is disabled). Tested on Fedora 34 with driverctl(8): $ modprobe virtio-vdpa $ modprobe vhost-vdpa $ modprobe vdpa-sim-net $ vdpa dev add mgmtdev vdpasim_net name dev1 # dev1 is attached to the first vDPA bus driver loaded $ driverctl -b vdpa list-devices dev1 virtio_vdpa $ driverctl -b vdpa set-override dev1 vhost_vdpa $ driverctl -b vdpa list-devices dev1 vhost_vdpa [*] Note: driverctl(8) integrates with udev so the binding is preserved. Suggested-by: Jason Wang Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20211126164753.181829-3-sgarzare@redhat.com Signed-off-by: Michael S. 
Tsirkin --- include/linux/vdpa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index c3011ccda430..ae34015b37b7 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -64,6 +64,7 @@ struct vdpa_mgmt_dev; * struct vdpa_device - representation of a vDPA device * @dev: underlying device * @dma_dev: the actual device that is performing DMA + * @driver_override: driver name to force a match * @config: the configuration ops for this device. * @cf_mutex: Protects get and set access to configuration layout. * @index: device index @@ -76,6 +77,7 @@ struct vdpa_mgmt_dev; struct vdpa_device { struct device dev; struct device *dma_dev; + const char *driver_override; const struct vdpa_config_ops *config; struct mutex cf_mutex; /* Protects get/set config */ unsigned int index; -- cgit v1.2.3 From 28cc408be72cebb0f3fcc37bc74ab3196d4de726 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Thu, 4 Nov 2021 20:52:48 +0100 Subject: vdpa: Mark vdpa_config_ops.get_vq_notification as optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since vhost_vdpa_mmap checks for its existence before calling it. Signed-off-by: Eugenio Pérez Link: https://lore.kernel.org/r/20211104195248.2088904-1-eperezma@redhat.com Signed-off-by: Michael S. 
Tsirkin Acked-by: Jason Wang Reviewed-by: Stefano Garzarella --- include/linux/vdpa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index ae34015b37b7..2b7db96bb7d3 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -157,7 +157,7 @@ struct vdpa_map_file { * @vdev: vdpa device * @idx: virtqueue index * @state: pointer to returned state (last_avail_idx) - * @get_vq_notification: Get the notification area for a virtqueue + * @get_vq_notification: Get the notification area for a virtqueue (optional) * @vdev: vdpa device * @idx: virtqueue index * Returns the notifcation area -- cgit v1.2.3 From a64917bc2e9b1e0aa716b783c4ec879fdd280300 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 5 Jan 2022 13:46:33 +0200 Subject: vdpa: Provide interface to read driver features Provide an interface to read the negotiated features. This is needed when building the netlink message in vdpa_dev_net_config_fill(). Also fix the implementation of vdpa_dev_net_config_fill() to use the negotiated features instead of the device features. To make APIs clearer, make the following name changes to struct vdpa_config_ops so they better describe their operations: get_features -> get_device_features set_features -> set_driver_features Finally, add get_driver_features to return the negotiated features and add implementation to all the upstream drivers. Acked-by: Jason Wang Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20220105114646.577224-2-elic@nvidia.com Signed-off-by: Michael S. 
Tsirkin --- include/linux/vdpa.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 2b7db96bb7d3..9cc4291a79b3 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -171,14 +171,17 @@ struct vdpa_map_file { * for the device * @vdev: vdpa device * Returns virtqueue algin requirement - * @get_features: Get virtio features supported by the device + * @get_device_features: Get virtio features supported by the device * @vdev: vdpa device * Returns the virtio features support by the * device - * @set_features: Set virtio features supported by the driver + * @set_driver_features: Set virtio features supported by the driver * @vdev: vdpa device * @features: feature support by the driver * Returns integer: success (0) or error (< 0) + * @get_driver_features: Get the virtio driver features in action + * @vdev: vdpa device + * Returns the virtio features accepted * @set_config_cb: Set the config interrupt callback * @vdev: vdpa device * @cb: virtio-vdev interrupt callback structure @@ -278,8 +281,9 @@ struct vdpa_config_ops { /* Device ops */ u32 (*get_vq_align)(struct vdpa_device *vdev); - u64 (*get_features)(struct vdpa_device *vdev); - int (*set_features)(struct vdpa_device *vdev, u64 features); + u64 (*get_device_features)(struct vdpa_device *vdev); + int (*set_driver_features)(struct vdpa_device *vdev, u64 features); + u64 (*get_driver_features)(struct vdpa_device *vdev); void (*set_config_cb)(struct vdpa_device *vdev, struct vdpa_callback *cb); u16 (*get_vq_num_max)(struct vdpa_device *vdev); @@ -397,7 +401,7 @@ static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) const struct vdpa_config_ops *ops = vdev->config; vdev->features_valid = true; - return ops->set_features(vdev, features); + return ops->set_driver_features(vdev, features); } void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, -- cgit v1.2.3 From 
73bc0dbb591baea322a7319c735e5f6c7dba9cfb Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 5 Jan 2022 13:46:35 +0200 Subject: vdpa: Sync calls set/get config/status with cf_mutex Add wrappers to get/set status and protect these operations with cf_mutex to serialize these operations with respect to get/set config operations. Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20220105114646.577224-4-elic@nvidia.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 9cc4291a79b3..ae047fae2603 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -408,6 +408,9 @@ void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len); void vdpa_set_config(struct vdpa_device *dev, unsigned int offset, const void *buf, unsigned int length); +u8 vdpa_get_status(struct vdpa_device *vdev); +void vdpa_set_status(struct vdpa_device *vdev, u8 status); + /** * struct vdpa_mgmtdev_ops - vdpa device ops * @dev_add: Add a vdpa device using alloc and register -- cgit v1.2.3 From aba21aff772b8622e08f07219069be793429a48f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 5 Jan 2022 13:46:37 +0200 Subject: vdpa: Allow to configure max data virtqueues Add netlink support to configure the max virtqueue pairs for a device. At least one pair is required. The maximum is dictated by the device. Example: $ vdpa dev add name vdpa-a mgmtdev auxiliary/mlx5_core.sf.1 max_vqp 4 Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20220105114646.577224-6-elic@nvidia.com Signed-off-by: Michael S. 
Tsirkin --- include/linux/vdpa.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index ae047fae2603..6d4d7e4fe208 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -101,6 +101,7 @@ struct vdpa_dev_set_config { struct { u8 mac[ETH_ALEN]; u16 mtu; + u16 max_vq_pairs; } net; u64 mask; }; @@ -391,17 +392,29 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev) static inline int vdpa_reset(struct vdpa_device *vdev) { const struct vdpa_config_ops *ops = vdev->config; + int ret; + mutex_lock(&vdev->cf_mutex); vdev->features_valid = false; - return ops->reset(vdev); + ret = ops->reset(vdev); + mutex_unlock(&vdev->cf_mutex); + return ret; } -static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) +static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features, bool locked) { const struct vdpa_config_ops *ops = vdev->config; + int ret; + + if (!locked) + mutex_lock(&vdev->cf_mutex); vdev->features_valid = true; - return ops->set_driver_features(vdev, features); + ret = ops->set_driver_features(vdev, features); + if (!locked) + mutex_unlock(&vdev->cf_mutex); + + return ret; } void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, -- cgit v1.2.3 From cd2629f6df1cab5b3df34705ae7f3bde6147fce3 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 5 Jan 2022 13:46:42 +0200 Subject: vdpa: Support reporting max device capabilities Add max_supported_vqs and supported_features fields to struct vdpa_mgmt_dev. Upstream drivers need to fill in these values according to the device capabilities. These values are reported back in a netlink message when showing management devices.
Examples: $ auxiliary/mlx5_core.sf.1: supported_classes net max_supported_vqs 257 dev_features CSUM GUEST_CSUM MTU HOST_TSO4 HOST_TSO6 STATUS CTRL_VQ MQ \ CTRL_MAC_ADDR VERSION_1 ACCESS_PLATFORM $ vdpa -j mgmtdev show {"mgmtdev":{"auxiliary/mlx5_core.sf.1":{"supported_classes":["net"], \ "max_supported_vqs":257,"dev_features":["CSUM","GUEST_CSUM","MTU", \ "HOST_TSO4","HOST_TSO6","STATUS","CTRL_VQ","MQ","CTRL_MAC_ADDR", \ "VERSION_1","ACCESS_PLATFORM"]}}} $ vdpa -jp mgmtdev show { "mgmtdev": { "auxiliary/mlx5_core.sf.1": { "supported_classes": [ "net" ], "max_supported_vqs": 257, "dev_features": ["CSUM","GUEST_CSUM","MTU","HOST_TSO4", \ "HOST_TSO6","STATUS","CTRL_VQ","MQ", \ "CTRL_MAC_ADDR","VERSION_1","ACCESS_PLATFORM"] } } } Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20220105114646.577224-11-elic@nvidia.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Si-Wei Liu --- include/linux/vdpa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 6d4d7e4fe208..a6047fd6cf12 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -460,6 +460,8 @@ struct vdpa_mgmt_dev { const struct virtio_device_id *id_table; u64 config_attr_mask; struct list_head list; + u64 supported_features; + u32 max_supported_vqs; }; int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev); -- cgit v1.2.3 From f6d955d80830b6e6f6a170be68cc3628f36365dd Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 11 Jan 2022 20:33:57 +0200 Subject: vdpa: Avoid taking cf_mutex lock on get status Avoid the wrapper holding cf_mutex since it is not protecting anything. To avoid confusion and unnecessary overhead incurred by it, remove. Fixes: f489f27bc0ab ("vdpa: Sync calls set/get config/status with cf_mutex") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20220111183400.38418-2-elic@nvidia.com Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Si-Wei Liu Acked-by: Jason Wang --- include/linux/vdpa.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index a6047fd6cf12..2de442ececae 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -421,7 +421,6 @@ void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len); void vdpa_set_config(struct vdpa_device *dev, unsigned int offset, const void *buf, unsigned int length); -u8 vdpa_get_status(struct vdpa_device *vdev); void vdpa_set_status(struct vdpa_device *vdev, u8 status); /** -- cgit v1.2.3 From 800977f6f32e452cba6b04ef21d2f5383ca29209 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:02:52 -0800 Subject: kthread: add the helper function kthread_run_on_cpu() Add a new helper function kthread_run_on_cpu(), which includes kthread_create_on_cpu/wake_up_process(). In some cases, use kthread_run_on_cpu() directly instead of kthread_create_on_node/kthread_bind/wake_up_process() or kthread_create_on_cpu/wake_up_process() or kthreadd_create/kthread_bind/wake_up_process() to simplify the code. [akpm@linux-foundation.org: export kthread_create_on_cpu to modules] Link: https://lkml.kernel.org/r/20211022025711.3673-2-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Cai Huoqing Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . 
McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 346b0f269161..db47aae7c481 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -56,6 +56,31 @@ bool kthread_is_per_cpu(struct task_struct *k); __k; \ }) +/** + * kthread_run_on_cpu - create and wake a cpu bound thread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @cpu: The cpu on which the thread should be bound, + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: Convenient wrapper for kthread_create_on_cpu() + * followed by wake_up_process(). Returns the kthread or + * ERR_PTR(-ENOMEM). + */ +static inline struct task_struct * +kthread_run_on_cpu(int (*threadfn)(void *data), void *data, + unsigned int cpu, const char *namefmt) +{ + struct task_struct *p; + + p = kthread_create_on_cpu(threadfn, data, cpu, namefmt); + if (!IS_ERR(p)) + wake_up_process(p); + + return p; +} + void free_kthread_struct(struct task_struct *k); void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); -- cgit v1.2.3 From 60115fa54ad7b913b7cb5844e6b7ffeb842d55f2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 14 Jan 2022 14:04:11 -0800 Subject: mm: defer kmemleak object creation of module_alloc() Yongqiang reports a kmemleak panic when module insmod/rmmod with KASAN enabled(without KASAN_VMALLOC) on x86[1]. When the module area allocates memory, it's kmemleak_object is created successfully, but the KASAN shadow memory of module allocation is not ready, so when kmemleak scan the module's pointer, it will panic due to no shadow memory with KASAN check. 
module_alloc __vmalloc_node_range kmemleak_vmalloc kmemleak_scan update_checksum kasan_module_alloc kmemleak_ignore Note, there is no problem if KASAN_VMALLOC is enabled, since the entire shadow memory of the modules area is preallocated. Thus, the bug only exists on architectures which support dynamic allocation of the module area per module load; for now, only x86/arm64/s390 are involved. Add a VM_DEFER_KMEMLEAK flag to defer the kmemleak registration of vmalloc'ed objects in module_alloc() to fix this issue. [1] https://lore.kernel.org/all/6d41e2b9-4692-5ec4-b1cd-cbe29ae89739@huawei.com/ [wangkefeng.wang@huawei.com: fix build] Link: https://lkml.kernel.org/r/20211125080307.27225-1-wangkefeng.wang@huawei.com [akpm@linux-foundation.org: simplify ifdefs, per Andrey] Link: https://lkml.kernel.org/r/CA+fCnZcnwJHUQq34VuRxpdoY6_XbJCDJ-jopksS5Eia4PijPzw@mail.gmail.com Link: https://lkml.kernel.org/r/20211124142034.192078-1-wangkefeng.wang@huawei.com Fixes: 793213a82de4 ("s390/kasan: dynamic shadow mem allocation for modules") Fixes: 39d114ddc682 ("arm64: add KASAN support") Fixes: bebf56a1b176 ("kasan: enable instrumentation of global variables") Signed-off-by: Kefeng Wang Reported-by: Yongqiang Liu Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Catalin Marinas Cc: Will Deacon Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Alexander Gordeev Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Alexander Potapenko Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 4 ++-- include/linux/vmalloc.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d8783b682669..89c99e5e67de 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -474,12 +474,12 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, * allocations with real shadow memory.
With KASAN vmalloc, the special * case is unnecessary, as the work is handled in the generic case. */ -int kasan_module_alloc(void *addr, size_t size); +int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); void kasan_free_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6e022cc712e6..880227b9f044 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -28,6 +28,13 @@ struct notifier_block; /* in notifier.h */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */ +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(CONFIG_KASAN_VMALLOC) +#define VM_DEFER_KMEMLEAK 0x00000800 /* defer kmemleak object creation */ +#else +#define VM_DEFER_KMEMLEAK 0 +#endif + /* * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC. * -- cgit v1.2.3 From c4386bd8ee3a921c3c799b7197dc898ade76a453 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:22 -0800 Subject: mm/memremap: add ZONE_DEVICE support for compound pages Add a new @vmemmap_shift property for struct dev_pagemap which specifies that a devmap is composed of a set of compound pages of order @vmemmap_shift, instead of base pages. When a compound page devmap is requested, all but the first page are initialised as tail pages instead of order-0 pages. 
For certain ZONE_DEVICE users like device-dax which have a fixed page size, this creates an opportunity to optimize GUP and GUP-fast walkers, treating it the same way as THP or hugetlb pages. Additionally, commit 7118fc2906e2 ("hugetlb: address ref count racing in prep_compound_gigantic_page") removed set_page_count() because the setting of page ref count to zero was redundant. devmap pages don't come from page allocator though and only head page refcount is used for compound pages, hence initialize tail page count to zero. Link: https://lkml.kernel.org/r/20211202204422.26777-5-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index c0e9d35889e8..61a6a0e27359 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -99,6 +99,11 @@ struct dev_pagemap_ops { * @done: completion for @internal_ref * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior + * @vmemmap_shift: structural definition of how the vmemmap page metadata + * is populated, specifically the metadata page order. + * A zero value (default) uses base pages as the vmemmap metadata + * representation. A bigger value will set up compound struct pages + * of the requested order value. * @ops: method table * @owner: an opaque pointer identifying the entity that manages this * instance. 
Used by various helpers to make sure that no @@ -114,6 +119,7 @@ struct dev_pagemap { struct completion done; enum memory_type type; unsigned int flags; + unsigned long vmemmap_shift; const struct dev_pagemap_ops *ops; void *owner; int nr_range; @@ -130,6 +136,11 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) return NULL; } +static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) +{ + return 1 << pgmap->vmemmap_shift; +} + #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); -- cgit v1.2.3 From 3e9d80a891df3b1a5d77db47fa7fdf33ba71e5cb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:05:04 -0800 Subject: mm,fs: split dump_mapping() out from dump_page() dump_mapping() is a big chunk of dump_page(), and it'd be handy to be able to call it when we don't have a struct page. Split it out and move it to fs/inode.c. Take the opportunity to simplify some of the debug messages a little. Link: https://lkml.kernel.org/r/20211121121056.2870061-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbf812ce89a8..5315fa68f751 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3152,6 +3152,7 @@ extern void unlock_new_inode(struct inode *); extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); +void dump_mapping(const struct address_space *); /* * Userspace may rely on the the inode number being non-zero. 
For example, glibc -- cgit v1.2.3 From b6bf9abb0aa44e53ffe9c1e6e1d32568f5b25e4a Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Fri, 14 Jan 2022 14:05:35 -0800 Subject: mm/memcg: add oom_group_kill memory event Our container agent wants to know when a container exits if it was OOM killed or not to report to the user. We use memory.oom.group = 1 to ensure that OOM kills within the container's cgroup kill everything. Existing memory.events are insufficient for knowing if this triggered: 1) Our current approach reads memory.events oom_kill and reports the container was killed if the value is non-zero. This is erroneous in some cases where containers create their children cgroups with memory.oom.group=1 as such OOM kills will get counted against the container cgroup's oom_kill counter despite not actually OOM killing the entire container. 2) Reading memory.events.local will fail to identify OOM kills in leaf cgroups (that don't set memory.oom.group) within the container cgroup. This patch adds a new oom_group_kill event when memory.oom.group triggers to allow userspace to cleanly identify when an entire cgroup is oom killed. 
[schatzberg.dan@gmail.com: changes from Johannes and Chris] Link: https://lkml.kernel.org/r/20211213162511.2492267-1-schatzberg.dan@gmail.com Link: https://lkml.kernel.org/r/20211203162426.3375036-1-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Chris Down Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Tejun Heo Cc: Zefan Li Cc: Jonathan Corbet Cc: Vladimir Davydov Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Alex Shi Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0c5c403f4be6..951f24f42147 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -42,6 +42,7 @@ enum memcg_memory_event { MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, + MEMCG_OOM_GROUP_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, -- cgit v1.2.3 From 4e5aa1f4c2b489bc6f3ab5ca54747b18a847289d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 14 Jan 2022 14:05:45 -0800 Subject: memcg: add per-memcg vmalloc stat The kvmalloc* allocation functions can fallback to vmalloc allocations and more often on long running machines. In addition the kernel does have __GFP_ACCOUNT kvmalloc* calls. So, often on long running machines, the memory.stat does not tell the complete picture which type of memory is charged to the memcg. So add a per-memcg vmalloc stat. 
[shakeelb@google.com: page_memcg() within rcu lock, per Muchun] Link: https://lkml.kernel.org/r/20211222052457.1960701-1-shakeelb@google.com [akpm@linux-foundation.org: remove cast, per Muchun] [shakeelb@google.com: remove area->page[0] checks and move to page by page accounting per Michal] Link: https://lkml.kernel.org/r/20220104222341.3972772-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20211221215336.1922823-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 951f24f42147..0131e5574c88 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -33,6 +33,7 @@ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, + MEMCG_VMALLOC, MEMCG_NR_STAT, }; @@ -992,6 +993,21 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } +static inline void mod_memcg_page_state(struct page *page, + int idx, int val) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = page_memcg(page); + if (memcg) + mod_memcg_state(memcg, idx, val); + rcu_read_unlock(); +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return READ_ONCE(memcg->vmstats.state[idx]); @@ -1447,6 +1463,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } +static inline void mod_memcg_page_state(struct page *page, + int idx, int val) +{ +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; -- cgit v1.2.3 From 9a10064f5625d5572c3626c1516e0bebc6c9fe9b Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Fri, 14 Jan 2022 14:05:59 -0800 Subject: mm: 
add a field to store names for private anonymous memory In many userspace applications, and especially in VM based applications like Android uses heavily, there are multiple different allocators in use. At a minimum there is libc malloc and the stack, and in many cases there are libc malloc, the stack, direct syscalls to mmap anonymous memory, and multiple VM heaps (one for small objects, one for big objects, etc.). Each of these layers usually has its own tools to inspect its usage; malloc by compiling a debug version, the VM through heap inspection tools, and for direct syscalls there is usually no way to track them. On Android we heavily use a set of tools that use an extended version of the logic covered in Documentation/vm/pagemap.txt to walk all pages mapped in userspace and slice their usage by process, shared (COW) vs. unique mappings, backing, etc. This can account for real physical memory usage even in cases like fork without exec (which Android uses heavily to share as many private COW pages as possible between processes), Kernel SamePage Merging, and clean zero pages. It produces a measurement of the pages that only exist in that process (USS, for unique), and a measurement of the physical memory usage of that process with the cost of shared pages being evenly split between processes that share them (PSS). If all anonymous memory is indistinguishable then figuring out the real physical memory usage (PSS) of each heap requires either a pagemap walking tool that can understand the heap debugging of every layer, or for every layer's heap debugging tools to implement the pagemap walking logic, in which case it is hard to get a consistent view of memory across the whole system. Tracking the information in userspace leads to all sorts of problems. 
It either needs to be stored inside the process, which means every process has to have an API to export its current heap information upon request, or it has to be stored externally in a filesystem that somebody needs to clean up on crashes. It needs to be readable while the process is still running, so it has to have some sort of synchronization with every layer of userspace. Efficiently tracking the ranges requires reimplementing something like the kernel vma trees, and linking to it from every layer of userspace. It requires more memory, more syscalls, more runtime cost, and more complexity to separately track regions that the kernel is already tracking. This patch adds a field to /proc/pid/maps and /proc/pid/smaps to show a userspace-provided name for anonymous vmas. The names of named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps as [anon:<name>]. Userspace can set the name for a region of memory by calling prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name). Setting the name to NULL clears it. The name length limit is 80 bytes including NUL-terminator and is checked to contain only printable ascii characters (including space), except '[',']','\','$' and '`'. Ascii strings are being used to have descriptive identifiers for vmas, which can be understood by the users reading /proc/pid/maps or /proc/pid/smaps. Names can be standardized for a given system and they can include some variable parts such as the name of the allocator or a library, tid of the thread using it, etc. The name is stored in a pointer in the shared union in vm_area_struct that points to a null terminated string. Anonymous vmas that have the same name (equivalent strings) and are otherwise mergeable will be merged. The name pointers are not shared between vmas even if they contain the same name. The name pointer is stored in a union with fields that are only used on file-backed mappings, so it does not increase memory usage.
CONFIG_ANON_VMA_NAME kernel configuration is introduced to enable this feature. It keeps the feature disabled by default to prevent any additional memory overhead and to avoid confusing procfs parsers on systems which are not ready to support named anonymous vmas. The patch is based on the original patch developed by Colin Cross, more specifically on its latest version [1] posted upstream by Sumit Semwal. It used a userspace pointer to store vma names. In that design, name pointers could be shared between vmas. However during the last upstreaming attempt, Kees Cook raised concerns [2] about this approach and suggested to copy the name into kernel memory space, perform validity checks [3] and store as a string referenced from vm_area_struct. One big concern is about fork() performance which would need to strdup anonymous vma names. Dave Hansen suggested experimenting with worst-case scenario of forking a process with 64k vmas having longest possible names [4]. I ran this experiment on an ARM64 Android device and recorded a worst-case regression of almost 40% when forking such a process. This regression is addressed in the followup patch which replaces the pointer to a name with a refcounted structure that allows sharing the name pointer between vmas of the same name. Instead of duplicating the string during fork() or when splitting a vma it increments the refcount. [1] https://lore.kernel.org/linux-mm/20200901161459.11772-4-sumit.semwal@linaro.org/ [2] https://lore.kernel.org/linux-mm/202009031031.D32EF57ED@keescook/ [3] https://lore.kernel.org/linux-mm/202009031022.3834F692@keescook/ [4] https://lore.kernel.org/linux-mm/5d0358ab-8c47-2f5f-8e43-23b89d6a8e95@intel.com/ Changes for prctl(2) manual page (in the options section): PR_SET_VMA Sets an attribute specified in arg2 for virtual memory areas starting from the address specified in arg3 and spanning the size specified in arg4. arg5 specifies the value of the attribute to be set. 
Note that assigning an attribute to a virtual memory area might prevent it from being merged with adjacent virtual memory areas due to the difference in that attribute's value. Currently, arg2 must be one of: PR_SET_VMA_ANON_NAME Set a name for anonymous virtual memory areas. arg5 should be a pointer to a null-terminated string containing the name. The name length including null byte cannot exceed 80 bytes. If arg5 is NULL, the name of the appropriate anonymous virtual memory areas will be reset. The name can contain only printable ascii characters (including space), except '[',']','\','$' and '`'. This feature is available only if the kernel is built with the CONFIG_ANON_VMA_NAME option enabled. [surenb@google.com: docs: proc.rst: /proc/PID/maps: fix malformed table] Link: https://lkml.kernel.org/r/20211123185928.2513763-1-surenb@google.com [surenb: rebased over v5.15-rc6, replaced userpointer with a kernel copy, added input sanitization and CONFIG_ANON_VMA_NAME config. The bulk of the work here was done by Colin Cross, therefore, with his permission, keeping him as the author] Link: https://lkml.kernel.org/r/20211019215511.3771969-2-surenb@google.com Signed-off-by: Colin Cross Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: Stephen Rothwell Cc: Al Viro Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Rientjes Cc: "Eric W. Biederman" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Glauber Cc: Johannes Weiner Cc: John Stultz Cc: Mel Gorman Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rob Landley Cc: "Serge E. 
Hallyn" Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 +++++++++- include/linux/mm_types.h | 64 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 72 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a7e4a9e7d807..7000442984b9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2658,7 +2658,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, const char *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -3391,5 +3391,16 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) return 0; } +#ifdef CONFIG_ANON_VMA_NAME +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name); +#else +static inline int +madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) { + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3a6e6209600..799e2ee626b2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -426,11 +426,19 @@ struct vm_area_struct { /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. + * + * For private anonymous mappings, a pointer to a null terminated string + * containing the name given to the vma, or NULL if unnamed. 
*/ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; + + union { + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + /* Serialized by mmap_sem. */ + char *anon_name; + }; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -875,4 +883,52 @@ typedef struct { unsigned long val; } swp_entry_t; +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. + */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From 78db3412833dc9c479cd17412035f216cfd01a29 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:06:03 -0800 Subject: mm: add 
anonymous vma name refcounting While forking a process with a high number (64K) of named anonymous vmas the overhead caused by strdup() is noticeable. Experiments with an ARM64 Android device show up to 40% performance regression when forking a process with 64k unpopulated anonymous vmas using the max name lengths vs the same process with the same number of anonymous vmas having no name. Introduce anon_vma_name refcounted structure to avoid the overhead of copying vma names during fork() and when splitting named anonymous vmas. When a vma is duplicated, instead of copying the name we increment the refcount of this structure. Multiple vmas can point to the same anon_vma_name as long as they increment the refcount. The name member of anon_vma_name structure is assigned at structure allocation time and is never changed. If vma name changes then the refcount of the original structure is dropped, a new anon_vma_name structure is allocated to hold the new name and the vma pointer is updated to point to the new structure. With this approach the fork() performance regression is reduced 3-4x, and with use cases using a more reasonable number of VMAs (a few thousand) the regression is not measurable. Link: https://lkml.kernel.org/r/20211019215511.3771969-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: Al Viro Cc: Colin Cross Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Rientjes Cc: "Eric W. Biederman" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Glauber Cc: Johannes Weiner Cc: John Stultz Cc: Mel Gorman Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rob Landley Cc: "Serge E. 
Hallyn" Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 799e2ee626b2..449b6eafc695 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -386,6 +387,12 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; +}; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -437,7 +444,7 @@ struct vm_area_struct { unsigned long rb_subtree_last; } shared; /* Serialized by mmap_sem. */ - char *anon_name; + struct anon_vma_name *anon_name; }; /* -- cgit v1.2.3 From 17fca131cee21724ee953a17c185c14e9533af5b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jan 2022 14:06:07 -0800 Subject: mm: move anon_vma declarations to linux/mm_inline.h The patch to add anonymous vma names causes a build failure in some configurations: include/linux/mm_types.h: In function 'is_same_vma_anon_name': include/linux/mm_types.h:924:37: error: implicit declaration of function 'strcmp' [-Werror=implicit-function-declaration] 924 | return name && vma_name && !strcmp(name, vma_name); | ^~~~~~ include/linux/mm_types.h:22:1: note: 'strcmp' is defined in header '<string.h>'; did you forget to '#include <string.h>'? This should not really be part of linux/mm_types.h in the first place, as that header is meant to only contain structure definitions and needs a minimum set of indirect includes itself. 
While the header clearly includes more than it should at this point, let's not make it worse by including string.h as well, which would pull in the expensive (compile-speed wise) fortify-string logic. Move the new functions into a separate header that only needs to be included in a couple of locations. Link: https://lkml.kernel.org/r/20211207125710.2503446-1-arnd@kernel.org Fixes: "mm: add a field to store names for private anonymous memory" Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Colin Cross Cc: Eric Biederman Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Stephen Rothwell Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 48 --------------------------------------------- 2 files changed, 50 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index e2ec68b0515c..47d96d2647ca 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@ #include #include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -135,4 +136,53 @@ static __always_inline void del_page_from_lru_list(struct page *page, { lruvec_del_folio(lruvec, page_folio(page)); } + +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. 
+ */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 449b6eafc695..4d5fb84eed5e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -890,52 +890,4 @@ typedef struct { unsigned long val; } swp_entry_t; -#ifdef CONFIG_ANON_VMA_NAME -/* - * mmap_lock should be read-locked when calling vma_anon_name() and while using - * the returned pointer. - */ -extern const char *vma_anon_name(struct vm_area_struct *vma); - -/* - * mmap_lock should be read-locked for orig_vma->vm_mm. - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be - * isolated. - */ -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma); - -/* - * mmap_lock should be write-locked or vma should have been isolated under - * write-locked mmap_lock protection. 
- */ -extern void free_vma_anon_name(struct vm_area_struct *vma); - -/* mmap_lock should be read-locked */ -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - const char *vma_name = vma_anon_name(vma); - - /* either both NULL, or pointers to same string */ - if (vma_name == name) - return true; - - return name && vma_name && !strcmp(name, vma_name); -} -#else /* CONFIG_ANON_VMA_NAME */ -static inline const char *vma_anon_name(struct vm_area_struct *vma) -{ - return NULL; -} -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) {} -static inline void free_vma_anon_name(struct vm_area_struct *vma) {} -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - return true; -} -#endif /* CONFIG_ANON_VMA_NAME */ - #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From 36090def7bad06a6346f86a7cfdbfda2d138cb64 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jan 2022 14:06:10 -0800 Subject: mm: move tlb_flush_pending inline helpers to mm_inline.h linux/mm_types.h should only define structure definitions, to make it cheap to include elsewhere. The atomic_t helper function definitions are particularly large, so it's better to move the helpers using those into the existing linux/mm_inline.h and only include that where needed. As a follow-up, we may want to go through all the indirect includes in mm_types.h and reduce them as much as possible. 
Link: https://lkml.kernel.org/r/20211207125710.2503446-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Stephen Rothwell Cc: Suren Baghdasaryan Cc: Colin Cross Cc: Kees Cook Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Yu Zhao Cc: Vlastimil Babka Cc: Matthew Wilcox (Oracle) Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 45 ---------------- include/linux/mm_inline.h | 86 +++++++++++++++++++++++++++++++ include/linux/mm_types.h | 129 ++++++++++++++++------------------------------ 3 files changed, 131 insertions(+), 129 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7000442984b9..c17e5cfc1e47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -424,51 +424,6 @@ extern unsigned int kobjsize(const void *objp); */ extern pgprot_t protection_map[16]; -/** - * enum fault_flag - Fault flag definitions. - * @FAULT_FLAG_WRITE: Fault was a write fault. - * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. - * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. - * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. - * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. - * @FAULT_FLAG_TRIED: The fault has been tried once. - * @FAULT_FLAG_USER: The fault originated in userspace. - * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. - * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. - * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * - * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify - * whether we would allow page faults to retry by specifying these two - * fault flags correctly. 
Currently there can be three legal combinations: - * - * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and - * this is the first try - * - * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and - * we've already tried at least once - * - * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry - * - * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never - * be used. Note that page faults can be allowed to retry for multiple times, - * in which case we'll have an initial fault with flags (a) then later on - * continuous faults with flags (b). We should always try to detect pending - * signals before a retry to make sure the continuous page faults can still be - * interrupted if necessary. - */ -enum fault_flag { - FAULT_FLAG_WRITE = 1 << 0, - FAULT_FLAG_MKWRITE = 1 << 1, - FAULT_FLAG_ALLOW_RETRY = 1 << 2, - FAULT_FLAG_RETRY_NOWAIT = 1 << 3, - FAULT_FLAG_KILLABLE = 1 << 4, - FAULT_FLAG_TRIED = 1 << 5, - FAULT_FLAG_USER = 1 << 6, - FAULT_FLAG_REMOTE = 1 << 7, - FAULT_FLAG_INSTRUCTION = 1 << 8, - FAULT_FLAG_INTERRUPTIBLE = 1 << 9, -}; - /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 47d96d2647ca..b725839dfe71 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -2,6 +2,7 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include #include #include #include @@ -185,4 +186,89 @@ static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline void init_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_set(&mm->tlb_flush_pending, 0); +} + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_inc(&mm->tlb_flush_pending); + /* + * The only time this value is relevant is when there are indeed pages + * to flush. 
And we'll only flush pages after changing them, which + * requires the PTL. + * + * So the ordering here is: + * + * atomic_inc(&mm->tlb_flush_pending); + * spin_lock(&ptl); + * ... + * set_pte_at(); + * spin_unlock(&ptl); + * + * spin_lock(&ptl) + * mm_tlb_flush_pending(); + * .... + * spin_unlock(&ptl); + * + * flush_tlb_range(); + * atomic_dec(&mm->tlb_flush_pending); + * + * Where the increment if constrained by the PTL unlock, it thus + * ensures that the increment is visible if the PTE modification is + * visible. After all, if there is no PTE modification, nobody cares + * about TLB flushes either. + * + * This very much relies on users (mm_tlb_flush_pending() and + * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and + * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc + * locks (PPC) the unlock of one doesn't order against the lock of + * another PTL. + * + * The decrement is ordered by the flush_tlb_range(), such that + * mm_tlb_flush_pending() will not return false unless all flushes have + * completed. + */ +} + +static inline void dec_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * See inc_tlb_flush_pending(). + * + * This cannot be smp_mb__before_atomic() because smp_mb() simply does + * not order against TLB invalidate completion, which is what we need. + * + * Therefore we must rely on tlb_flush_*() to guarantee order. + */ + atomic_dec(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * Must be called after having acquired the PTL; orders against that + * PTLs release and therefore ensures that if we observe the modified + * PTE we must also observe the increment from inc_tlb_flush_pending(). + * + * That is, it only guarantees to return true if there is a flush + * pending for _this_ PTL. 
+ */ + return atomic_read(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +{ + /* + * Similar to mm_tlb_flush_pending(), we must have acquired the PTL + * for which there is a TLB flush pending in order to guarantee + * we've seen both that PTE modification and the increment. + * + * (no requirement on actually still holding the PTL, that is irrelevant) + */ + return atomic_read(&mm->tlb_flush_pending) > 1; +} + + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4d5fb84eed5e..6a89f128c990 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -692,90 +692,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_finish_mmu(struct mmu_gather *tlb); -static inline void init_tlb_flush_pending(struct mm_struct *mm) -{ - atomic_set(&mm->tlb_flush_pending, 0); -} - -static inline void inc_tlb_flush_pending(struct mm_struct *mm) -{ - atomic_inc(&mm->tlb_flush_pending); - /* - * The only time this value is relevant is when there are indeed pages - * to flush. And we'll only flush pages after changing them, which - * requires the PTL. - * - * So the ordering here is: - * - * atomic_inc(&mm->tlb_flush_pending); - * spin_lock(&ptl); - * ... - * set_pte_at(); - * spin_unlock(&ptl); - * - * spin_lock(&ptl) - * mm_tlb_flush_pending(); - * .... - * spin_unlock(&ptl); - * - * flush_tlb_range(); - * atomic_dec(&mm->tlb_flush_pending); - * - * Where the increment if constrained by the PTL unlock, it thus - * ensures that the increment is visible if the PTE modification is - * visible. After all, if there is no PTE modification, nobody cares - * about TLB flushes either. 
- * - * This very much relies on users (mm_tlb_flush_pending() and - * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and - * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc - * locks (PPC) the unlock of one doesn't order against the lock of - * another PTL. - * - * The decrement is ordered by the flush_tlb_range(), such that - * mm_tlb_flush_pending() will not return false unless all flushes have - * completed. - */ -} - -static inline void dec_tlb_flush_pending(struct mm_struct *mm) -{ - /* - * See inc_tlb_flush_pending(). - * - * This cannot be smp_mb__before_atomic() because smp_mb() simply does - * not order against TLB invalidate completion, which is what we need. - * - * Therefore we must rely on tlb_flush_*() to guarantee order. - */ - atomic_dec(&mm->tlb_flush_pending); -} - -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) -{ - /* - * Must be called after having acquired the PTL; orders against that - * PTLs release and therefore ensures that if we observe the modified - * PTE we must also observe the increment from inc_tlb_flush_pending(). - * - * That is, it only guarantees to return true if there is a flush - * pending for _this_ PTL. - */ - return atomic_read(&mm->tlb_flush_pending); -} - -static inline bool mm_tlb_flush_nested(struct mm_struct *mm) -{ - /* - * Similar to mm_tlb_flush_pending(), we must have acquired the PTL - * for which there is a TLB flush pending in order to guarantee - * we've seen both that PTE modification and the increment. - * - * (no requirement on actually still holding the PTL, that is irrelevant) - */ - return atomic_read(&mm->tlb_flush_pending) > 1; -} - struct vm_fault; /** @@ -890,4 +806,49 @@ typedef struct { unsigned long val; } swp_entry_t; +/** + * enum fault_flag - Fault flag definitions. + * @FAULT_FLAG_WRITE: Fault was a write fault. + * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. + * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. 
+ * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. + * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. + * @FAULT_FLAG_TRIED: The fault has been tried once. + * @FAULT_FLAG_USER: The fault originated in userspace. + * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. + * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. + * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two + * fault flags correctly. Currently there can be three legal combinations: + * + * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and + * this is the first try + * + * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and + * we've already tried at least once + * + * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry + * + * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never + * be used. Note that page faults can be allowed to retry for multiple times, + * in which case we'll have an initial fault with flags (a) then later on + * continuous faults with flags (b). We should always try to detect pending + * signals before a retry to make sure the continuous page faults can still be + * interrupted if necessary. 
+ */ +enum fault_flag { + FAULT_FLAG_WRITE = 1 << 0, + FAULT_FLAG_MKWRITE = 1 << 1, + FAULT_FLAG_ALLOW_RETRY = 1 << 2, + FAULT_FLAG_RETRY_NOWAIT = 1 << 3, + FAULT_FLAG_KILLABLE = 1 << 4, + FAULT_FLAG_TRIED = 1 << 5, + FAULT_FLAG_USER = 1 << 6, + FAULT_FLAG_REMOTE = 1 << 7, + FAULT_FLAG_INSTRUCTION = 1 << 8, + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, +}; + #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From cc6dcfee72509868271d42919a3c1081b6b0dc7e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:06:18 -0800 Subject: mm: document locking restrictions for vm_operations_struct::close Add comments for vm_operations_struct::close documenting locking requirements for this callback and its callers. Link: https://lkml.kernel.org/r/20211209191325.3069345-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c17e5cfc1e47..4d7245e6802a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -532,6 +532,10 @@ enum page_entry_size { */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. 
+ */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); -- cgit v1.2.3 From 08d5b29eac7dd5e6c79b66d390ecbb9219e05931 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:33 -0800 Subject: mm: ptep_clear() page table helper We have ptep_get_and_clear() and ptep_get_and_clear_full() helpers to clear PTE from user page tables, but there is no variant for simple clear of a present PTE from user page tables without using a low level pte_clear() which can be either native or para-virtualised. Add a new ptep_clear() that can be used in common code to clear PTEs from page table. We will need this call later in order to add a hook for page table check. Link: https://lkml.kernel.org/r/20211221154650.1047963-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Kees Cook Cc: Masahiro Yamada Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Peter Zijlstra Cc: Sami Tolvanen Cc: Thomas Gleixner Cc: Wei Xu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pgtable.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e24d2c992b11..bc8713a76e03 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -258,6 +258,14 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef __HAVE_ARCH_PTEP_CLEAR +static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_clear(mm, addr, ptep); +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, -- cgit v1.2.3 From df4e817b710809425d899340dbfa8504a3ca4ba5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:37 -0800 Subject: mm: page table check Check user page table entries at the time they are added and removed. This allows memory corruption issues related to double mapping to be caught synchronously. When a pte for an anonymous page is added into page table, we verify that this pte does not already point to a file backed page, and vice versa if this is a file backed page that is being added we verify that this page does not have an anonymous mapping. We also enforce that read-only sharing for anonymous pages is allowed (i.e. cow after fork). All other sharing must be for file pages. Page table check makes it possible to protect against and debug cases where "struct page" metadata became corrupted for some reason. For example, when refcnt or mapcount become invalid. 
Link: https://lkml.kernel.org/r/20211221154650.1047963-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Kees Cook Cc: Masahiro Yamada Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Peter Zijlstra Cc: Sami Tolvanen Cc: Thomas Gleixner Cc: Wei Xu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_table_check.h | 147 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 include/linux/page_table_check.h (limited to 'include/linux') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h new file mode 100644 index 000000000000..38cace1da7b6 --- /dev/null +++ b/include/linux/page_table_check.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2021, Google LLC. 
+ * Pasha Tatashin + */ +#ifndef __LINUX_PAGE_TABLE_CHECK_H +#define __LINUX_PAGE_TABLE_CHECK_H + +#ifdef CONFIG_PAGE_TABLE_CHECK +#include + +extern struct static_key_true page_table_check_disabled; +extern struct page_ext_operations page_table_check_ops; + +void __page_table_check_zero(struct page *page, unsigned int order); +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte); +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd); +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud); +void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); +void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud); + +static inline void page_table_check_alloc(struct page *page, unsigned int order) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_zero(page, order); +} + +static inline void page_table_check_free(struct page *page, unsigned int order) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_zero(page, order); +} + +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_clear(mm, addr, pte); +} + +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pmd_clear(mm, addr, pmd); +} + +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pud_clear(mm, addr, pud); +} + +static inline void 
page_table_check_pte_set(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pte_t pte) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_set(mm, addr, ptep, pte); +} + +static inline void page_table_check_pmd_set(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pmd_set(mm, addr, pmdp, pmd); +} + +static inline void page_table_check_pud_set(struct mm_struct *mm, + unsigned long addr, pud_t *pudp, + pud_t pud) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pud_set(mm, addr, pudp, pud); +} + +#else + +static inline void page_table_check_alloc(struct page *page, unsigned int order) +{ +} + +static inline void page_table_check_free(struct page *page, unsigned int order) +{ +} + +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) +{ +} + +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) +{ +} + +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) +{ +} + +static inline void page_table_check_pte_set(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pte_t pte) +{ +} + +static inline void page_table_check_pmd_set(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + pmd_t pmd) +{ +} + +static inline void page_table_check_pud_set(struct mm_struct *mm, + unsigned long addr, pud_t *pudp, + pud_t pud) +{ +} + +#endif /* CONFIG_PAGE_TABLE_CHECK */ +#endif /* __LINUX_PAGE_TABLE_CHECK_H */ -- cgit v1.2.3 From 020e87650af9f43683546729f959fdc78422a4b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:44 -0800 Subject: mm: remove last argument of reuse_swap_page() None of the callers care about the total_map_swapcount() any more. 
Link: https://lkml.kernel.org/r/20211220205943.456187-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Reviewed-by: William Kucharski Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index d1ea44b31f19..bdccbf1efa61 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -514,7 +514,7 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern bool reuse_swap_page(struct page *, int *); +extern bool reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); @@ -680,8 +680,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page, total_map_swapcount) \ - (page_trans_huge_mapcount(page, total_map_swapcount) == 1) +#define reuse_swap_page(page) \ + (page_trans_huge_mapcount(page, NULL) == 1) static inline int try_to_free_swap(struct page *page) { -- cgit v1.2.3 From d08d2b62510e2407cf939e693aefd179dc114913 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:51 -0800 Subject: mm: remove the total_mapcount argument from page_trans_huge_mapcount() All callers pass NULL, so we can stop calculating the value we would store in it. 
Link: https://lkml.kernel.org/r/20211220205943.456187-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Linus Torvalds Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++------- include/linux/swap.h | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d7245e6802a..cef65f9cbdf2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -799,19 +799,15 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); -int page_trans_huge_mapcount(struct page *page, int *total_mapcount); +int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } -static inline int page_trans_huge_mapcount(struct page *page, - int *total_mapcount) +static inline int page_trans_huge_mapcount(struct page *page) { - int mapcount = page_mapcount(page); - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; + return page_mapcount(page); } #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index bdccbf1efa61..1d38d9475c4d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -681,7 +681,7 @@ static inline int swp_swapcount(swp_entry_t entry) } #define reuse_swap_page(page) \ - (page_trans_huge_mapcount(page, NULL) == 1) + (page_trans_huge_mapcount(page) == 1) static inline int try_to_free_swap(struct page *page) { -- cgit v1.2.3 From a421ef303008b0ceee2cfc625c3246fa7654b0ca Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:07 -0800 Subject: mm: allow !GFP_KERNEL allocations for kvmalloc Support for GFP_NO{FS,IO} and __GFP_NOFAIL has been implemented by previous patches so we can allow the support for kvmalloc. 
This will allow some external users to simplify or completely remove their helpers. GFP_NOWAIT semantic hasn't been supported so far but it hasn't been explicitly documented so let's add a note about that. ceph_kvmalloc is the first helper to be dropped and changed to kvmalloc. Link: https://lkml.kernel.org/r/20211122153233.9924-5-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jeff Layton Cc: Neil Brown Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ceph/libceph.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 409d8c29bc4f..309acbcb5a8a 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -295,7 +295,6 @@ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); -extern void *ceph_kvmalloc(size_t size, gfp_t flags); struct fs_parameter; struct fc_log; -- cgit v1.2.3 From 4034247a0d6ab281ba3293798ce67af494d86129 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2022 14:07:14 -0800 Subject: mm: introduce memalloc_retry_wait() Various places in the kernel - largely in filesystems - respond to a memory allocation failure by looping around and re-trying. Some of these cannot conveniently use __GFP_NOFAIL, for reasons such as: - a GFP_ATOMIC allocation, which __GFP_NOFAIL doesn't work on - a need to check for the process being signalled between failures - the possibility that other recovery actions could be performed - the allocation is quite deep in support code, and passing down an extra flag to say if __GFP_NOFAIL is wanted would be clumsy. 
Many of these currently use congestion_wait() which (in almost all cases) simply waits the given timeout - congestion isn't tracked for most devices. It isn't clear what the best delay is for loops, but it is clear that the various filesystems shouldn't be responsible for choosing a timeout. This patch introduces memalloc_retry_wait() which takes on that responsibility. Code that wants to retry a memory allocation can call this function passing the GFP flags that were used. It will wait however long is appropriate. For now, it only considers __GFP_NORETRY and whatever gfpflags_allow_blocking() tests. If blocking is allowed without __GFP_NORETRY, then alloc_page either made some reclaim progress, or waited for a while, before failing. So there is no need for much further waiting. memalloc_retry_wait() will wait until the current jiffy ends. If this condition is not met, then alloc_page() won't have waited much if at all. In that case memalloc_retry_wait() waits about 20ms (HZ/50). This is the delay that most current loops use. linux/sched/mm.h needs to be included in some files now, but linux/backing-dev.h does not. Link: https://lkml.kernel.org/r/163754371968.13692.1277530886009912421@noble.neil.brown.name Signed-off-by: NeilBrown Cc: Dave Chinner Cc: Michal Hocko Cc: "Theodore Ts'o" Cc: Jaegeuk Kim Cc: Chao Yu Cc: Darrick J.
Wong Cc: Chuck Lever Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index aca874d33fe6..aa5f09ca5bcf 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -214,6 +214,32 @@ static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif +/* Any memory-allocation retry loop should use + * memalloc_retry_wait(), and pass the flags for the most + * constrained allocation attempt that might have failed. + * This provides useful documentation of where loops are, + * and a central place to fine tune the waiting as the MM + * implementation changes. + */ +static inline void memalloc_retry_wait(gfp_t gfp_flags) +{ + /* We use io_schedule_timeout because waiting for memory + * typically included waiting for dirty pages to be + * written out, which requires IO. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + gfp_flags = current_gfp_context(gfp_flags); + if (gfpflags_allow_blocking(gfp_flags) && + !(gfp_flags & __GFP_NORETRY)) + /* Probably waited already, no need for much more */ + io_schedule_timeout(1); + else + /* Probably didn't wait, and has now released a lock, + * so now is a good time to wait + */ + io_schedule_timeout(HZ/50); +} + /** * might_alloc - Mark possible allocation sites * @gfp_mask: gfp_t flags that would be used to allocate -- cgit v1.2.3 From 1611f74a94ba2e0f2d25b75008ed8e76e122097a Mon Sep 17 00:00:00 2001 From: Changcheng Deng Date: Fri, 14 Jan 2022 14:07:21 -0800 Subject: mm: fix boolreturn.cocci warning Return statements in functions returning bool should use true/false instead of 1/0. 
Link: https://lkml.kernel.org/r/20211126073327.74815-1-deng.changcheng@zte.com.cn Signed-off-by: Changcheng Deng Reported-by: Zeal Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b5f14d581113..18423c2157e8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -383,7 +383,7 @@ static __always_inline int TestClearPage##uname(struct page *page) \ TESTCLEARFLAG(uname, lname, policy) #define TESTPAGEFLAG_FALSE(uname, lname) \ -static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \ +static inline bool folio_test_##lname(const struct folio *folio) { return false; } \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ -- cgit v1.2.3 From be1a13eb51077b2ec5f7f4306f93dfece503a3f1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:27 -0800 Subject: mm: drop node from alloc_pages_vma alloc_pages_vma is meant to allocate a page with a vma specific memory policy. The initial node parameter is always a local node so it is pointless to waste a function argument for this. Drop the parameter. 
Link: https://lkml.kernel.org/r/YaSnlv4QpryEpesG@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: "Huang, Ying" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 8fcc38467af6..78b58448f796 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -598,9 +598,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order); struct folio *folio_alloc(gfp_t gfp, unsigned order); extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node, bool hugepage); + bool hugepage); #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) + alloc_pages_vma(gfp_mask, order, vma, addr, true) #else static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { @@ -610,14 +610,14 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } -#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ +#define alloc_pages_vma(gfp_mask, order, vma, addr, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) + alloc_pages_vma(gfp_mask, 0, vma, addr, false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); -- cgit v1.2.3 From 04a536bfbd0f885338eecc2a4503dfca50ac94dd Mon Sep 17 00:00:00 2001 
From: Miles Chen Date: Fri, 14 Jan 2022 14:07:30 -0800 Subject: include/linux/gfp.h: further document GFP_DMA32 kmalloc(..., GFP_DMA32) does not return DMA32 memory because the DMA32 kmalloc cache array is not implemented. (Reason: there is no such user in kernel). Put a short comment about this so people can understand this by reading the comment. [1] https://lists.linuxfoundation.org/pipermail/iommu/2018-December/031696.html Link: https://lkml.kernel.org/r/20211207093610.6406-1-miles.chen@mediatek.com Signed-off-by: Miles Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 78b58448f796..80f63c862be5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -302,7 +302,9 @@ struct vm_area_struct; * lowest zone as a type of emergency reserve. * * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit - * address. + * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory + * because the DMA32 kmalloc cache array is not implemented. + * (Reason: there is no such user in kernel). * * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, * do not need to be directly accessible by the kernel but that cannot -- cgit v1.2.3 From 62b3107073646e0946bd97ff926832bafb846d17 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 14 Jan 2022 14:07:37 -0800 Subject: mm_zone: add function to check if managed dma zone exists Patch series "Handle warning of allocation failure on DMA zone w/o managed pages", v4. **Problem observed: On x86_64, when crash is triggered and entering into kdump kernel, page allocation failure can always be seen. 
--------------------------------- DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations swapper/0: page allocation failure: order:5, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 0 PID: 1 Comm: swapper/0 Call Trace: dump_stack+0x7f/0xa1 warn_alloc.cold+0x72/0xd6 ...... __alloc_pages+0x24d/0x2c0 ...... dma_atomic_pool_init+0xdb/0x176 do_one_initcall+0x67/0x320 ? rcu_read_lock_sched_held+0x3f/0x80 kernel_init_freeable+0x290/0x2dc ? rest_init+0x24f/0x24f kernel_init+0xa/0x111 ret_from_fork+0x22/0x30 Mem-Info: ------------------------------------ ***Root cause: In the current kernel, it assumes that DMA zone must have managed pages and try to request pages if CONFIG_ZONE_DMA is enabled. While this is not always true. E.g in kdump kernel of x86_64, only low 1M is presented and locked down at very early stage of boot, so that this low 1M won't be added into buddy allocator to become managed pages of DMA zone. This exception will always cause page allocation failure if page is requested from DMA zone. ***Investigation: This failure happens since below commit merged into linus's tree. 1a6a9044b967 x86/setup: Remove CONFIG_X86_RESERVE_LOW and reservelow= options 23721c8e92f7 x86/crash: Remove crash_reserve_low_1M() f1d4d47c5851 x86/setup: Always reserve the first 1M of RAM 7c321eb2b843 x86/kdump: Remove the backup region handling 6f599d84231f x86/kdump: Always reserve the low 1M when the crashkernel option is specified Before them, on x86_64, the low 640K area will be reused by kdump kernel. So in kdump kernel, the content of low 640K area is copied into a backup region for dumping before jumping into kdump. Then except of those firmware reserved region in [0, 640K], the left area will be added into buddy allocator to become available managed pages of DMA zone. However, after above commits applied, in kdump kernel of x86_64, the low 1M is reserved by memblock, but not released to buddy allocator. 
So any later page allocation requested from DMA zone will fail. At the beginning, if crashkernel is reserved, the low 1M needs to be locked down because AMD SME encrypts memory making the old backup region mechanism impossible when switching into kdump kernel. Later, it was also observed that there are BIOSes corrupting memory under 1M. To solve this, in commit f1d4d47c5851, the entire region of low 1M is always reserved after the real mode trampoline is allocated. Besides, recently, an Intel engineer mentioned their TDX (Trust Domain Extensions) which is under development in kernel also needs to lock down the low 1M. So we can't simply revert above commits to fix the page allocation failure from DMA zone as someone suggested. ***Solution: Currently, only DMA atomic pool and dma-kmalloc will initialize and request page allocation with GFP_DMA during bootup. So only initialize DMA atomic pool when DMA zone has available managed pages, otherwise just skip the initialization. For dma-kmalloc(), for the time being, let's mute the warning of allocation failure if requesting pages from DMA zone while it has no managed pages. Meanwhile, change code to use dma_alloc_xx/dma_map_xx API to replace kmalloc(GFP_DMA), or do not use GFP_DMA when calling kmalloc() if not necessary. Christoph is posting patches to fix those under drivers/scsi/. Finally, we can remove the need of dma-kmalloc() as people suggested. This patch (of 3): In some places of the current kernel, it assumes that dma zone must have managed pages if CONFIG_ZONE_DMA is enabled. While this is not always true. E.g in kdump kernel of x86_64, only low 1M is presented and locked down at very early stage of boot, so that there's no managed pages at all in DMA zone. This exception will always cause page allocation failure if page is requested from DMA zone. Here add function has_managed_dma() and the relevant helper functions to check if there's DMA zone with managed pages. It will be used in later patches.
Link: https://lkml.kernel.org/r/20211223094435.248523-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20211223094435.248523-2-bhe@redhat.com Fixes: 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified") Signed-off-by: Baoquan He Reviewed-by: David Hildenbrand Acked-by: John Donnelly Cc: Christoph Hellwig Cc: Christoph Lameter Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: David Laight Cc: Borislav Petkov Cc: Marek Szyprowski Cc: Robin Murphy Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 936dc0b6c226..aed44e9b5d89 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1047,6 +1047,15 @@ static inline int is_highmem_idx(enum zone_type idx) #endif } +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void); +#else +static inline bool has_managed_dma(void) +{ + return false; +} +#endif + /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references -- cgit v1.2.3 From f47761999052b1cc987dd3e3d3adf47997358fc0 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 14 Jan 2022 14:07:48 -0800 Subject: hugetlb: add hugetlb.*.numa_stat file For hugetlb backed jobs/VMs it's critical to understand the numa information for the memory backing these jobs to deliver optimal performance. Currently this technically can be queried from /proc/self/numa_maps, but there are significant issues with that. Namely: 1. Memory can be mapped or unmapped. 2. numa_maps are per process and need to be aggregated across all processes in the cgroup. For shared memory this is more involved as the userspace needs to make sure it doesn't double count shared mappings. 3. 
I believe querying numa_maps needs to hold the mmap_lock which adds to the contention on this lock. For these reasons I propose simply adding hugetlb.*.numa_stat file, which shows the numa information of the cgroup similarly to memory.numa_stat. On cgroup-v2: cat /sys/fs/cgroup/unified/test/hugetlb.2MB.numa_stat total=2097152 N0=2097152 N1=0 On cgroup-v1: cat /sys/fs/cgroup/hugetlb/test/hugetlb.2MB.numa_stat total=2097152 N0=2097152 N1=0 hierarichal_total=2097152 N0=2097152 N1=0 This patch was tested manually by allocating hugetlb memory and querying the hugetlb.*.numa_stat file of the cgroup and its parents. [colin.i.king@googlemail.com: fix spelling mistake "hierarichal" -> "hierarchical"] Link: https://lkml.kernel.org/r/20211125090635.23508-1-colin.i.king@gmail.com [keescook@chromium.org: fix copy/paste array assignment] Link: https://lkml.kernel.org/r/20211203065647.2819707-1-keescook@chromium.org Link: https://lkml.kernel.org/r/20211123001020.4083653-1-almasrymina@google.com Signed-off-by: Mina Almasry Signed-off-by: Colin Ian King Signed-off-by: Kees Cook Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Shuah Khan Cc: Miaohe Lin Cc: Oscar Salvador Cc: Michal Hocko Cc: David Rientjes Cc: Jue Wang Cc: Yang Yao Cc: Joanna Li Cc: Cannon Matthews Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++-- include/linux/hugetlb_cgroup.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 00351ccb49a3..d1897a69c540 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -622,8 +622,8 @@ struct hstate { #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ - struct cftype cgroup_files_dfl[7]; - struct cftype cgroup_files_legacy[9]; + struct cftype cgroup_files_dfl[8]; + struct cftype cgroup_files_legacy[10]; #endif char name[HSTATE_NAME_LEN]; }; diff --git 
a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index ba025ae27882..379344828e78 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -36,6 +36,11 @@ enum hugetlb_memory_event { HUGETLB_NR_MEMORY_EVENTS, }; +struct hugetlb_cgroup_per_node { + /* hugetlb usage in pages over all hstates. */ + unsigned long usage[HUGE_MAX_HSTATE]; +}; + struct hugetlb_cgroup { struct cgroup_subsys_state css; @@ -57,6 +62,8 @@ struct hugetlb_cgroup { /* Handle for "hugetlb.events.local" */ struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; + + struct hugetlb_cgroup_per_node *nodeinfo[]; }; static inline struct hugetlb_cgroup * -- cgit v1.2.3 From e9ea874a8ffb0f8ebed4f4981531a32c5b663d79 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 14 Jan 2022 14:07:55 -0800 Subject: mm/vmstat: add events for THP max_ptes_* exceeds There are interfaces to adjust max_ptes_none, max_ptes_swap, max_ptes_shared values, see /sys/kernel/mm/transparent_hugepage/khugepaged/. But system administrator may not know which value is the best. So Add those events to support adjusting max_ptes_* to suitable values. For example, if default max_ptes_swap value causes too much failures, and system uses zram whose IO is fast, administrator could increase max_ptes_swap until THP_SCAN_EXCEED_SWAP_PTE not increase anymore. 
Link: https://lkml.kernel.org/r/20211225094036.574157-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Cc: "Huang, Ying" Cc: Dave Hansen Cc: Minchan Kim Cc: Saravanan D Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a185cc75ff52..7b2363388bfa 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -98,6 +98,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, THP_SPLIT_PMD, + THP_SCAN_EXCEED_NONE_PTE, + THP_SCAN_EXCEED_SWAP_PTE, + THP_SCAN_EXCEED_SHARED_PTE, #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD THP_SPLIT_PUD, #endif -- cgit v1.2.3 From e4b424b7ec8791087375bb1f2480a3ba05d21e0b Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 14 Jan 2022 14:08:07 -0800 Subject: vmscan: make drop_slab_node static drop_slab_node is only used in drop_slab. So remove its declaration from header file and add keyword static for its definition.
Link: https://lkml.kernel.org/r/20211111062445.5236-1-ligang.bdlg@bytedance.com Signed-off-by: Gang Li Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cef65f9cbdf2..eb67eb699b78 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3122,7 +3122,6 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, #endif void drop_slab(void); -void drop_slab_node(int nid); #ifndef CONFIG_MMU #define randomize_va_space 0 -- cgit v1.2.3 From c6018b4b254971863bd0ad36bb5e7d0fa0f0ddb0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 14 Jan 2022 14:08:17 -0800 Subject: mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. 
This helps applications to hint at a memory allocation preference node and fall back to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example, a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory: new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from the node closer to the local node. One of the reasons to specify a home node is to allow allocations from a CPU-less NUMA node and its nearby NUMA nodes. MPOL_PREFERRED_MANY, on the other hand, will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, the kernel will allocate from slow memory node 10, 11 and 12, whichever is closer to node 2.
Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3c7595e81150..668389b4b53d 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -46,6 +46,7 @@ struct mempolicy { unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ + int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ -- cgit v1.2.3 From 21b084fdf2a49ca1634e8e360e9ab6f9ff0dee11 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 14 Jan 2022 14:08:21 -0800 Subject: mm/mempolicy: wire up syscall set_mempolicy_home_node Link: https://lkml.kernel.org/r/20211202123810.267175-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 528a478dbda8..819c0cb00b6d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1057,6 +1057,9 @@ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type ru const void __user *rule_attr, __u32 flags); asmlinkage 
long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); asmlinkage long sys_memfd_secret(unsigned int flags); +asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); /* * Architecture-specific system calls -- cgit v1.2.3 From c9fdc4d5487a16bd1f003fc8b66e91f88efb50e6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 14 Jan 2022 14:09:06 -0800 Subject: mm/hwpoison: remove MF_MSG_BUDDY_2ND and MF_MSG_POISONED_HUGE These action_page_types are no longer used, so remove them. Link: https://lkml.kernel.org/r/20211115084006.3728254-3-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Acked-by: Yang Shi Cc: "Aneesh Kumar K.V" Cc: David Hildenbrand Cc: Ding Hui Cc: Miaohe Lin Cc: Michal Hocko Cc: Oscar Salvador Cc: Peter Xu Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb67eb699b78..7f594da84aca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3201,7 +3201,6 @@ enum mf_action_page_type { MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, - MF_MSG_POISONED_HUGE, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_NON_PMD_HUGE, @@ -3216,7 +3215,6 @@ enum mf_action_page_type { MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, - MF_MSG_BUDDY_2ND, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, -- cgit v1.2.3 From bf181c582588f8f7406d52f2ee228539b465f173 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 14 Jan 2022 14:09:09 -0800 Subject: mm/hwpoison: fix unpoison_memory() After recent soft-offline rework, error pages can be taken off from buddy allocator, but the existing unpoison_memory() does not properly undo the operation. Moreover, due to the recent change on __get_hwpoison_page(), get_page_unless_zero() is hardly called for hwpoisoned pages. 
So __get_hwpoison_page() highly likely returns -EBUSY (meaning to fail to grab page refcount) and unpoison just clears PG_hwpoison without releasing a refcount. That does not lead to a critical issue like kernel panic, but unpoisoned pages never get back to buddy (leaked permanently), which is not good. To (partially) fix this, we need to identify "taken off" pages from other types of hwpoisoned pages. We can't use refcount or page flags for this purpose, so a pseudo flag is defined by hacking ->private field. Someone might think that put_page() is enough to cancel taken-off pages, but the normal free path contains some operations not suitable for the current purpose, and can fire VM_BUG_ON(). Note that unpoison_memory() is now supposed to be cancel hwpoison events injected only by madvise() or /sys/devices/system/memory/{hard,soft}_offline_page, not by MCE injection, so please don't try to use unpoison when testing with MCE injection. [lkp@intel.com: report build failure for ARCH=i386] Link: https://lkml.kernel.org/r/20211115084006.3728254-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Cc: Ding Hui Cc: Tony Luck Cc: "Aneesh Kumar K.V" Cc: Miaohe Lin Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + include/linux/page-flags.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7f594da84aca..d4fb49a5d60d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3174,6 +3174,7 @@ enum mf_flags { MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, + MF_UNPOISON = 1 << 4, }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 18423c2157e8..7e2b90dc7d3f 100644 --- 
a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -522,7 +522,11 @@ PAGEFLAG_FALSE(Uncached, uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) +#define MAGIC_HWPOISON 0x48575053U /* HWPS */ +extern void SetPageHWPoisonTakenOff(struct page *page); +extern void ClearPageHWPoisonTakenOff(struct page *page); extern bool take_page_off_buddy(struct page *page); +extern bool put_page_back_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 -- cgit v1.2.3 From 5ee2fa2f063649570c702164f47a558a3432dd9e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 14 Jan 2022 14:09:16 -0800 Subject: mm/rmap: fix potential batched TLB flush race In theory, the following race is possible for batched TLB flushing. CPU0 CPU1 ---- ---- shrink_page_list() unmap zap_pte_range() flush_tlb_batched_pending() flush_tlb_mm() try_to_unmap() set_tlb_ubc_flush_pending() mm->tlb_flush_batched = true mm->tlb_flush_batched = false After the TLB is flushed on CPU1 via flush_tlb_mm() and before mm->tlb_flush_batched is set to false, some PTE is unmapped on CPU0 and the TLB flushing is pended. Then the pended TLB flushing will be lost. Although both set_tlb_ubc_flush_pending() and flush_tlb_batched_pending() are called with PTL locked, different PTL instances may be used. Because the race window is really small, and the lost TLB flushing will cause problem only if a TLB entry is inserted before the unmapping in the race window, the race is only theoretical. But the fix is simple and cheap too. 
Syzbot has reported this too as follows: ================================================================== BUG: KCSAN: data-race in flush_tlb_batched_pending / try_to_unmap_one write to 0xffff8881072cfbbc of 1 bytes by task 17406 on cpu 1: flush_tlb_batched_pending+0x5f/0x80 mm/rmap.c:691 madvise_free_pte_range+0xee/0x7d0 mm/madvise.c:594 walk_pmd_range mm/pagewalk.c:128 [inline] walk_pud_range mm/pagewalk.c:205 [inline] walk_p4d_range mm/pagewalk.c:240 [inline] walk_pgd_range mm/pagewalk.c:277 [inline] __walk_page_range+0x981/0x1160 mm/pagewalk.c:379 walk_page_range+0x131/0x300 mm/pagewalk.c:475 madvise_free_single_vma mm/madvise.c:734 [inline] madvise_dontneed_free mm/madvise.c:822 [inline] madvise_vma mm/madvise.c:996 [inline] do_madvise+0xe4a/0x1140 mm/madvise.c:1202 __do_sys_madvise mm/madvise.c:1228 [inline] __se_sys_madvise mm/madvise.c:1226 [inline] __x64_sys_madvise+0x5d/0x70 mm/madvise.c:1226 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881072cfbbc of 1 bytes by task 71 on cpu 0: set_tlb_ubc_flush_pending mm/rmap.c:636 [inline] try_to_unmap_one+0x60e/0x1220 mm/rmap.c:1515 rmap_walk_anon+0x2fb/0x470 mm/rmap.c:2301 try_to_unmap+0xec/0x110 shrink_page_list+0xe91/0x2620 mm/vmscan.c:1719 shrink_inactive_list+0x3fb/0x730 mm/vmscan.c:2394 shrink_list mm/vmscan.c:2621 [inline] shrink_lruvec+0x3c9/0x710 mm/vmscan.c:2940 shrink_node_memcgs+0x23e/0x410 mm/vmscan.c:3129 shrink_node+0x8f6/0x1190 mm/vmscan.c:3252 kswapd_shrink_node mm/vmscan.c:4022 [inline] balance_pgdat+0x702/0xd30 mm/vmscan.c:4213 kswapd+0x200/0x340 mm/vmscan.c:4473 kthread+0x2c7/0x2e0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 value changed: 0x01 -> 0x00 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 71 Comm: kswapd0 Not tainted 5.16.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 
================================================================== [akpm@linux-foundation.org: tweak comments] Link: https://lkml.kernel.org/r/20211201021104.126469-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reported-by: syzbot+aa5bebed695edaccf0df@syzkaller.appspotmail.com Cc: Nadav Amit Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Dave Hansen Cc: Will Deacon Cc: Yu Zhao Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a89f128c990..e3b0476a4fda 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -647,7 +647,7 @@ struct mm_struct { atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ - bool tlb_flush_batched; + atomic_t tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_PREEMPT_RT -- cgit v1.2.3 From cab0a7c115546a4865fb7439558af9077a569574 Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 14 Jan 2022 14:09:28 -0800 Subject: mm: make some vars and functions static or __init "page_idle_ops" is a global var, but it is only used within its own file, so it should be static. "page_ext_ops" is a var used in the kernel initialization phase, and other functions are also used in the kernel initialization phase, so they should be __init or __initdata to reclaim memory. 
Link: https://lkml.kernel.org/r/20211217095023.67293-1-liuting.0x7c00@bytedance.com Signed-off-by: Ting Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_idle.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 83abf95e9fa7..4663dfed1293 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -13,7 +13,6 @@ * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. */ -extern struct page_ext_operations page_idle_ops; static inline bool folio_test_young(struct folio *folio) { -- cgit v1.2.3 From cdeed009f3bceee41f73f0137db785fd29a05cb8 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:44 -0800 Subject: mm/damon: remove some unneeded function definitions in damon.h In damon.h some func definitions about VA & PA can only be used in its own file, so there no need to define in the header file, and the header file will look cleaner. If other files later need these functions, the prototypes can be added to damon.h at that time. 
[sj@kernel.org: remove unnecessary function prototype position changes] Link: https://lkml.kernel.org/r/20211118114827.20052-1-sj@kernel.org Link: https://lkml.kernel.org/r/45fd5b3ef6cce8e28dbc1c92f9dc845ccfc949d7.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index b4d4be3cc987..1d1be348f506 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -461,34 +461,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ #ifdef CONFIG_DAMON_VADDR - -/* Monitoring primitives for virtual memory address spaces */ -void damon_va_init(struct damon_ctx *ctx); -void damon_va_update(struct damon_ctx *ctx); -void damon_va_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_va_check_accesses(struct damon_ctx *ctx); bool damon_va_target_valid(void *t); -void damon_va_cleanup(struct damon_ctx *ctx); -int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_VADDR */ #ifdef CONFIG_DAMON_PADDR - -/* Monitoring primitives for the physical memory address space */ -void damon_pa_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); -int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct 
damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_PADDR */ #endif /* _DAMON_H */ -- cgit v1.2.3 From 9b2a38d6ef25c1748e3964b0ff30a89e4ed26583 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:53 -0800 Subject: mm/damon: move damon_rand() definition into damon.h damon_rand() is called in three files: damon/core.c, damon/paddr.c and damon/vaddr.c. I think there is no need to define it more than once, so moving it to damon.h is a good choice. Link: https://lkml.kernel.org/r/20211202075859.51341-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 1d1be348f506..3e91a597a1aa 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -11,12 +11,16 @@ #include <linux/mutex.h> #include <linux/time64.h> #include <linux/types.h> +#include <linux/random.h> /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). -- cgit v1.2.3 From 234d68732b6c135087bdebfa0630a43ae8c27758 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:56 -0800 Subject: mm/damon: modify damon_rand() macro to static inline function damon_rand() cannot be implemented as a macro. Example: damon_rand(a++, b); The value of 'a' will be incremented twice. This is obviously unreasonable, so fix it by converting the macro to a static inline function. 
Link: https://lkml.kernel.org/r/110ffcd4e420c86c42b41ce2bc9f0fe6a4f32cd3.1638795127.git.xhao@linux.alibaba.com Fixes: b9a6ac4e4ede ("mm/damon: adaptively adjust regions") Signed-off-by: Xin Hao Reported-by: Andrew Morton Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3e91a597a1aa..e2c8152985b7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -19,7 +19,10 @@ #define DAMOS_MAX_SCORE (99) /* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) +static inline unsigned long damon_rand(unsigned long l, unsigned long r) +{ + return l + prandom_u32_max(r - l); +} /** * struct damon_addr_range - Represents an address region of [@start, @end). -- cgit v1.2.3 From 88f86dcfa454784f7de550966c60fc78a3e95d6d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:09:59 -0800 Subject: mm/damon: convert macro functions to static inline functions Patch series "mm/damon: Misc cleanups". This patchset contains miscellaneous cleanups for DAMON's macro functions and documentation. This patch (of 6): This commit converts macro functions in DAMON to static inline functions, for better type checking, code documentation, etc[1]. 
[1] https://lore.kernel.org/linux-mm/20211202151213.6ec830863342220da4141bc5@linux-foundation.org/ Link: https://lkml.kernel.org/r/20211209131806.19317-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211209131806.19317-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index e2c8152985b7..2dbc1f545da2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -399,14 +399,20 @@ struct damon_ctx { struct list_head schemes; }; -#define damon_next_region(r) \ - (container_of(r->list.next, struct damon_region, list)) +static inline struct damon_region *damon_next_region(struct damon_region *r) +{ + return container_of(r->list.next, struct damon_region, list); +} -#define damon_prev_region(r) \ - (container_of(r->list.prev, struct damon_region, list)) +static inline struct damon_region *damon_prev_region(struct damon_region *r) +{ + return container_of(r->list.prev, struct damon_region, list); +} -#define damon_last_region(t) \ - (list_last_entry(&t->regions_list, struct damon_region, list)) +static inline struct damon_region *damon_last_region(struct damon_target *t) +{ + return list_last_entry(&t->regions_list, struct damon_region, list); +} #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) -- cgit v1.2.3 From f4c6d22c6cf282ef7d24a724b9bd978ee2b74fc6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:14 -0800 Subject: mm/damon: remove a mistakenly added comment for a future feature Due to a mistake in patches reordering, a comment for a future feature called 'arbitrary monitoring target support'[1], which is still under development, has added. 
Because it only introduces confusion and we don't have a plan to post the patches soon, this commit removes the mistakenly added part. [1] https://lore.kernel.org/linux-mm/20201215115448.25633-3-sjpark@amazon.com/ Link: https://lkml.kernel.org/r/20211209131806.19317-7-sj@kernel.org Fixes: 1f366e421c8f ("mm/damon/core: implement DAMON-based Operation Schemes (DAMOS)") Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2dbc1f545da2..97f4a224e950 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -281,7 +281,7 @@ struct damon_ctx; * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action - * to the region. This is not used for &DAMON_ARBITRARY_TARGET case. + * to the region. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. -- cgit v1.2.3 From 0e92c2ee9f459542c5384d9cfab24873c3dd6398 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:17 -0800 Subject: mm/damon/schemes: account scheme actions that successfully applied Patch series "mm/damon/schemes: Extend stats for better online analysis and tuning". To help online access pattern analysis and tuning of DAMON-based Operation Schemes (DAMOS), DAMOS provides simple statistics for each scheme. Introduction of DAMOS time/space quota further made the tuning easier by making the risk management easier. However, that also made understanding of the working schemes a little bit more difficult. 
For an example, progress of a given scheme can now be throttled by not only the aggressiveness of the target access pattern, but also the time/space quotas. So, when a scheme is showing unexpectedly slow progress, it's difficult to know by what the progress of the scheme is throttled, with currently provided statistics. This patchset extends the statistics to contain some metrics that can be helpful for such online schemes analysis and tuning (patches 1-2), exports those to users (patches 3 and 5), and add documents (patches 4 and 6). This patch (of 6): DAMON-based operation schemes (DAMOS) stats provide only the number and the amount of regions that the action of the scheme has tried to be applied. Because the action could be failed for some reasons, the currently provided information is sometimes not useful or convenient enough for schemes profiling and tuning. To improve this situation, this commit extends the DAMOS stats to provide the number and the amount of regions that the action has successfully applied. Link: https://lkml.kernel.org/r/20211210150016.35349-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211210150016.35349-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 97f4a224e950..e0ad3d9aaeed 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -192,6 +192,20 @@ struct damos_watermarks { bool activated; }; +/** + * struct damos_stat - Statistics on a given scheme. + * @nr_tried: Total number of regions that the scheme is tried to be applied. + * @sz_tried: Total size of regions that the scheme is tried to be applied. + * @nr_applied: Total number of regions that the scheme is applied. + * @sz_applied: Total size of regions that the scheme is applied. 
+ */ +struct damos_stat { + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -203,8 +217,7 @@ struct damos_watermarks { * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. - * @stat_count: Total number of regions that this scheme is applied. - * @stat_sz: Total size of regions that this scheme is applied. + * @stat: Statistics of this scheme. * @list: List head for siblings. * * For each aggregation interval, DAMON finds regions which fit in the @@ -235,8 +248,7 @@ struct damos { enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; - unsigned long stat_count; - unsigned long stat_sz; + struct damos_stat stat; struct list_head list; }; @@ -281,7 +293,8 @@ struct damon_ctx; * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action - * to the region. + * to the region and return bytes of the region that the action is successfully + * applied. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. 
@@ -295,8 +308,9 @@ struct damon_primitive { int (*get_scheme_score)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); + unsigned long (*apply_scheme)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); }; -- cgit v1.2.3 From 6268eac34ca30af7f6313504d556ec7fcd295621 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:20 -0800 Subject: mm/damon/schemes: account how many times quota limit has exceeded If the time/space quotas of a given DAMON-based operation scheme is too small, the scheme could show unexpectedly slow progress. However, there is no good way to notice the case in runtime. This commit extends the DAMOS stat to provide how many times the quota limits exceeded so that the users can easily notice the case and tune the scheme. Link: https://lkml.kernel.org/r/20211210150016.35349-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index e0ad3d9aaeed..af648388e759 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -198,12 +198,14 @@ struct damos_watermarks { * @sz_tried: Total size of regions that the scheme is tried to be applied. * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. + * @qt_exceeds: Total number of times the quota of the scheme has exceeded. 
*/ struct damos_stat { unsigned long nr_tried; unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long qt_exceeds; }; /** -- cgit v1.2.3 From 2cd4b8e10cc31eadb5b10b1d73b3f28156f3776c Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jan 2022 14:10:38 -0800 Subject: mm/damon: move the implementation of damon_insert_region to damon.h Usually, inline function is declared static since it should sit between storage and type. And implement it in a header file if used by multiple files. And this change also fixes compile issue when backport damon to 5.10. mm/damon/vaddr.c: In function `damon_va_evenly_split_region': ./include/linux/damon.h:425:13: error: inlining failed in call to `always_inline' `damon_insert_region': function body not available 425 | inline void damon_insert_region(struct damon_region *r, | ^~~~~~~~~~~~~~~~~~~ mm/damon/vaddr.c:86:3: note: called from here 86 | damon_insert_region(n, r, next, t); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Link: https://lkml.kernel.org/r/20211223085703.6142-1-guoqing.jiang@linux.dev Signed-off-by: Guoqing Jiang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index af648388e759..5e1e3a128b77 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -451,9 +451,18 @@ static inline struct damon_region *damon_last_region(struct damon_target *t) #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); -inline void damon_insert_region(struct damon_region *r, + +/* + * Add a region between two other regions + */ +static inline void damon_insert_region(struct damon_region *r, struct damon_region *prev, struct damon_region *next, - struct damon_target *t); + struct damon_target *t) +{ + __list_add(&r->list, &prev->list, 
&next->list); + t->nr_regions++; +} + void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); -- cgit v1.2.3 From 47d8c15615c0a2046d2d90b04cb80b81ddf31fb1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:16:59 -0700 Subject: include: move find.h from asm_generic to linux find_bit API and bitmap API are closely related, but inclusion paths are different - include/asm-generic and include/linux, respectively. In the past this caused a lot of trouble due to circular dependencies and/or undefined symbols. Fix this by moving find.h under include/linux. Signed-off-by: Yury Norov Tested-by: Wolfram Sang Acked-by: Geert Uytterhoeven --- include/linux/bitmap.h | 1 + include/linux/find.h | 268 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 269 insertions(+) create mode 100644 include/linux/find.h (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index a241dcf50f39..ead4a150bd7f 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -6,6 +6,7 @@ #include <linux/align.h> #include <linux/bitops.h> +#include <linux/find.h> #include <linux/limits.h> #include <linux/string.h> #include <linux/types.h> diff --git a/include/linux/find.h b/include/linux/find.h new file mode 100644 index 000000000000..c5410c243e04 --- /dev/null +++ b/include/linux/find.h @@ -0,0 +1,268 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_FIND_H_ +#define __LINUX_FIND_H_ + +#ifndef __LINUX_BITMAP_H +#error only <linux/bitmap.h> can be included directly +#endif + +#include <linux/bitops.h> + +extern unsigned long _find_next_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert, unsigned long le); +extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); + +#ifndef find_next_bit 
+/** + * find_next_bit - find the next set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); +} +#endif + +#ifndef find_next_and_bit +/** + * find_next_and_bit - find the next set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr1 & *addr2 & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); +} +#endif + +#ifndef find_next_zero_bit +/** + * find_next_zero_bit - find the next cleared bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number of the next zero bit + * If no bits are zero, returns @size. 
+ */ +static inline +unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); +} +#endif + +#ifdef CONFIG_GENERIC_FIND_FIRST_BIT + +#ifndef find_first_bit +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum number of bits to search + * + * Returns the bit number of the first set bit. + * If no bits are set, returns @size. + */ +static inline +unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_bit(addr, size); +} +#endif + +#ifndef find_first_zero_bit +/** + * find_first_zero_bit - find the first cleared bit in a memory region + * @addr: The address to start the search at + * @size: The maximum number of bits to search + * + * Returns the bit number of the first cleared bit. + * If no bits are zero, returns @size. + */ +static inline +unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr | ~GENMASK(size - 1, 0); + + return val == ~0UL ? 
size : ffz(val); + } + + return _find_first_zero_bit(addr, size); +} +#endif + +#else /* CONFIG_GENERIC_FIND_FIRST_BIT */ + +#ifndef find_first_bit +#define find_first_bit(addr, size) find_next_bit((addr), (size), 0) +#endif +#ifndef find_first_zero_bit +#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) +#endif + +#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ + +#ifndef find_last_bit +/** + * find_last_bit - find the last set bit in a memory region + * @addr: The address to start the search at + * @size: The number of bits to search + * + * Returns the bit number of the last set bit, or size. + */ +static inline +unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __fls(val) : size; + } + + return _find_last_bit(addr, size); +} +#endif + +/** + * find_next_clump8 - find next 8-bit clump with set bits in a memory region + * @clump: location to store copy of found clump + * @addr: address to base the search on + * @size: bitmap size in number of bits + * @offset: bit offset at which to start searching + * + * Returns the bit offset for the next set clump; the found clump value is + * copied to the location pointed by @clump. If no bits are set, returns @size. 
+ */ +extern unsigned long find_next_clump8(unsigned long *clump, + const unsigned long *addr, + unsigned long size, unsigned long offset); + +#define find_first_clump8(clump, bits, size) \ + find_next_clump8((clump), (bits), (size), 0) + +#if defined(__LITTLE_ENDIAN) + +static inline unsigned long find_next_zero_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_zero_bit(addr, size, offset); +} + +static inline unsigned long find_next_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static inline unsigned long find_first_zero_bit_le(const void *addr, + unsigned long size) +{ + return find_first_zero_bit(addr, size); +} + +#elif defined(__BIG_ENDIAN) + +#ifndef find_next_zero_bit_le +static inline +unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + + return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); +} +#endif + +#ifndef find_next_bit_le +static inline +unsigned long find_next_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) & GENMASK(size - 1, offset); + return val ? 
__ffs(val) : size; + } + + return _find_next_bit(addr, NULL, size, offset, 0UL, 1); +} +#endif + +#ifndef find_first_zero_bit_le +#define find_first_zero_bit_le(addr, size) \ + find_next_zero_bit_le((addr), (size), 0) +#endif + +#else +#error "Please fix <asm/byteorder.h>" +#endif + +#endif /*__LINUX_FIND_H_ */ -- cgit v1.2.3 From c126a53c276048125b4a950072bab37ad0fea120 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:00 -0700 Subject: arch: remove GENERIC_FIND_FIRST_BIT entirely In the 5.12 cycle we enabled the GENERIC_FIND_FIRST_BIT config option for ARM64 and MIPS. It increased performance and shrunk .text size; and so far I didn't receive any negative feedback on the change. https://lore.kernel.org/linux-arch/20210225135700.1381396-1-yury.norov@gmail.com/ Now I think it's a good time to switch all architectures to use find_{first,last}_bit() unconditionally, and so remove the corresponding config option. The patch doesn't introduce functional changes for arc, arm, arm64, mips, m68k, s390 and x86; for other architectures I expect improvement both in performance and .text size. 
Signed-off-by: Yury Norov Tested-by: Alexander Lobakin (mips) Reviewed-by: Alexander Lobakin (mips) Reviewed-by: Andy Shevchenko Acked-by: Will Deacon Tested-by: Wolfram Sang --- include/linux/find.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index c5410c243e04..ea57f7f38c49 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -101,8 +101,6 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, } #endif -#ifdef CONFIG_GENERIC_FIND_FIRST_BIT - #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region @@ -147,17 +145,6 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) } #endif -#else /* CONFIG_GENERIC_FIND_FIRST_BIT */ - -#ifndef find_first_bit -#define find_first_bit(addr, size) find_next_bit((addr), (size), 0) -#endif -#ifndef find_first_zero_bit -#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) -#endif - -#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ - #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region -- cgit v1.2.3 From f68edc9297bf3f7c94abb54b9b0b053607f7587b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:01 -0700 Subject: lib: add find_first_and_bit() Currently find_first_and_bit() is an alias to find_next_and_bit(). However, it is widely used in cpumask, so it is worth optimizing. This patch adds its own implementation for find_first_and_bit(). On x86_64 find_bit_benchmark says: Before (#define find_first_and_bit(...) 
find_next_and_bit(..., 0): Start testing find_bit() with random-filled bitmap [ 140.291468] find_first_and_bit: 46890919 ns, 32671 iterations Start testing find_bit() with sparse bitmap [ 140.295028] find_first_and_bit: 7103 ns, 1 iterations After: Start testing find_bit() with random-filled bitmap [ 162.574907] find_first_and_bit: 25045813 ns, 32846 iterations Start testing find_bit() with sparse bitmap [ 162.578458] find_first_and_bit: 4900 ns, 1 iterations (Thanks to Alexey Klimov for thorough testing.) Signed-off-by: Yury Norov Tested-by: Wolfram Sang Tested-by: Alexey Klimov --- include/linux/find.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index ea57f7f38c49..6048f8c97418 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -12,6 +12,8 @@ extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start, unsigned long invert, unsigned long le); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); @@ -123,6 +125,31 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) } #endif +#ifndef find_first_and_bit +/** + * find_first_and_bit - find the first set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. 
+ */ +static inline +unsigned long find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_and_bit(addr1, addr2, size); +} +#endif + #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region -- cgit v1.2.3 From 93ba139ba8190c33009c5353ca43c8519443f467 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:02 -0700 Subject: cpumask: use find_first_and_bit() Now we have an efficient implementation for find_first_and_bit(), so switch cpumask to use it where appropriate. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/linux/cpumask.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 1e7399fc69c0..c4e1b9ea0ba4 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -123,6 +123,12 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return 0; } +static inline unsigned int cpumask_first_and(const struct cpumask *srcp1, + const struct cpumask *srcp2) +{ + return 0; +} + static inline unsigned int cpumask_last(const struct cpumask *srcp) { return 0; @@ -167,7 +173,7 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node) static inline int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { - return cpumask_next_and(-1, src1p, src2p); + return cpumask_first_and(src1p, src2p); } static inline int cpumask_any_distribute(const struct cpumask *srcp) @@ -195,6 +201,19 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 + * @src1p: the 
first input + * @src2p: the second input + * + * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). + */ +static inline +unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) +{ + return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); +} + /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer @@ -585,15 +604,6 @@ static inline void cpumask_copy(struct cpumask *dstp, */ #define cpumask_any(srcp) cpumask_first(srcp) -/** - * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 - * @src1p: the first input - * @src2p: the second input - * - * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). - */ -#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p)) - /** * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2 * @mask1: the first input cpumask -- cgit v1.2.3 From 9b51d9d866482a703646fd4c07e433c3d9d88efd Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:05 -0700 Subject: cpumask: replace cpumask_next_* with cpumask_first_* where appropriate cpumask_first() is a more effective analogue of 'next' version if n == -1 (which means start == 0). This patch replaces 'next' with 'first' where things look trivial. There's no cpumask_first_zero() function, so create it. 
Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/linux/cpumask.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c4e1b9ea0ba4..64dae70d31f5 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -123,6 +123,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return 0; } +static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +{ + return 0; +} + static inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { @@ -201,6 +206,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_first_zero - get the first unset cpu in a cpumask + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if all cpus are set. + */ +static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +{ + return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits); +} + /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @src1p: the first input -- cgit v1.2.3 From bc9d6635c293a2ac30c6319f7cfd08860ab7948a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:06 -0700 Subject: include/linux: move for_each_bit() macros from bitops.h to find.h for_each_bit() macros depend on find_bit() machinery, and so the proper place for them is the find.h header. 
Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/linux/bitops.h | 34 ---------------------------------- include/linux/find.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 5e62e2383b7f..7aaed501f768 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -32,40 +32,6 @@ extern unsigned long __sw_hweight64(__u64 w); */ #include -#define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -/* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_from(bit, addr, size) \ - for ((bit) = find_next_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -#define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_first_zero_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) - -/* same as for_each_clear_bit() but use bit as value to start with */ -#define for_each_clear_bit_from(bit, addr, size) \ - for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) - -/** - * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits - * @start: bit offset to start search and to store the current iteration offset - * @clump: location to store copy of current 8-bit clump - * @bits: bitmap address to base the search on - * @size: bitmap size in number of bits - */ -#define for_each_set_clump8(start, clump, bits, size) \ - for ((start) = find_first_clump8(&(clump), (bits), (size)); \ - (start) < (size); \ - (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) - static inline int get_bitmask_order(unsigned int count) { int order; diff --git a/include/linux/find.h 
b/include/linux/find.h index 6048f8c97418..4500e8ab93e2 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -279,4 +279,38 @@ unsigned long find_next_bit_le(const void *addr, unsigned #error "Please fix " #endif +#define for_each_set_bit(bit, addr, size) \ + for ((bit) = find_first_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +/* same as for_each_set_bit() but use bit as value to start with */ +#define for_each_set_bit_from(bit, addr, size) \ + for ((bit) = find_next_bit((addr), (size), (bit)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +#define for_each_clear_bit(bit, addr, size) \ + for ((bit) = find_first_zero_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + +/* same as for_each_clear_bit() but use bit as value to start with */ +#define for_each_clear_bit_from(bit, addr, size) \ + for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ + (bit) < (size); \ + (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + +/** + * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits + * @start: bit offset to start search and to store the current iteration offset + * @clump: location to store copy of current 8-bit clump + * @bits: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_clump8(start, clump, bits, size) \ + for ((start) = find_first_clump8(&(clump), (bits), (size)); \ + (start) < (size); \ + (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) + #endif /*__LINUX_FIND_H_ */ -- cgit v1.2.3 From 7516be9931b8bc8bcaac8531f490b42ab11ded1e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:07 -0700 Subject: find: micro-optimize for_each_{set,clear}_bit() The macros iterate thru all set/clear bits in a bitmap. They search a first bit using find_first_bit(), and the rest bits using find_next_bit(). 
Since find_next_bit() is called shortly after find_first_bit(), we can save a few lines of I-cache by not using find_first_bit(). Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/linux/find.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 4500e8ab93e2..ae9ed52b52b8 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -280,7 +280,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned #endif #define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ + for ((bit) = find_next_bit((addr), (size), 0); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) @@ -291,7 +291,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) = find_next_bit((addr), (size), (bit) + 1)) #define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_first_zero_bit((addr), (size)); \ + for ((bit) = find_next_zero_bit((addr), (size), 0); \ (bit) < (size); \ (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) -- cgit v1.2.3 From ec288a2cf7ca40a939316b6df206ab845bb112d1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:11 -0700 Subject: bitmap: unify find_bit operations bitmap_for_each_{set,clear}_region() are similar to for_each_bit() macros in include/linux/find.h, but their interface and implementation are different. This patch adds for_each_bitrange() macros and drops unused bitmap_*_region() API for the sake of unification.
Signed-off-by: Yury Norov Tested-by: Wolfram Sang Acked-by: Dennis Zhou Acked-by: Ulf Hansson # For MMC --- include/linux/bitmap.h | 33 ----------------------------- include/linux/find.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index ead4a150bd7f..7dba0847510c 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -55,12 +55,6 @@ struct device; * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off) as above - * bitmap_next_clear_region(map, &start, &end, nbits) Find next clear region - * bitmap_next_set_region(map, &start, &end, nbits) Find next set region - * bitmap_for_each_clear_region(map, rs, re, start, end) - * Iterate over all clear regions - * bitmap_for_each_set_region(map, rs, re, start, end) - * Iterate over all set regions * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest @@ -467,14 +461,6 @@ static inline void bitmap_replace(unsigned long *dst, __bitmap_replace(dst, old, new, mask, nbits); } -static inline void bitmap_next_clear_region(unsigned long *bitmap, - unsigned int *rs, unsigned int *re, - unsigned int end) -{ - *rs = find_next_zero_bit(bitmap, end, *rs); - *re = find_next_bit(bitmap, end, *rs + 1); -} - static inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) @@ -483,25 +469,6 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, *re = find_next_zero_bit(bitmap, end, *rs + 1); } -/* - * Bitmap region iterators. Iterates over the bitmap between [@start, @end). 
- * @rs and @re should be integer variables and will be set to start and end - * index of the current clear or set region. - */ -#define bitmap_for_each_clear_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), \ - bitmap_next_clear_region((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, \ - bitmap_next_clear_region((bitmap), &(rs), &(re), (end))) - -#define bitmap_for_each_set_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), \ - bitmap_next_set_region((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, \ - bitmap_next_set_region((bitmap), &(rs), &(re), (end))) - /** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. * @n: u64 value diff --git a/include/linux/find.h b/include/linux/find.h index ae9ed52b52b8..5bb6db213bcb 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -301,6 +301,62 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) < (size); \ (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) +/** + * for_each_set_bitrange - iterate over all set bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit) + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_bitrange(b, e, addr, size) \ + for ((b) = find_next_bit((addr), (size), 0), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_bit((addr), (size), (e) + 1), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1)) + +/** + * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit); must be initialized + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_bitrange_from(b, e, addr, size) \ + for ((b) = 
find_next_bit((addr), (size), (b)), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_bit((addr), (size), (e) + 1), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1)) + +/** + * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) + * @b: bit offset of start of current bitrange (first unset bit) + * @e: bit offset of end of current bitrange (first set bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_clear_bitrange(b, e, addr, size) \ + for ((b) = find_next_zero_bit((addr), (size), 0), \ + (e) = find_next_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_zero_bit((addr), (size), (e) + 1), \ + (e) = find_next_bit((addr), (size), (b) + 1)) + +/** + * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) + * @b: bit offset of start of current bitrange (first unset bit); must be initialized + * @e: bit offset of end of current bitrange (first set bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_clear_bitrange_from(b, e, addr, size) \ + for ((b) = find_next_zero_bit((addr), (size), (b)), \ + (e) = find_next_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_zero_bit((addr), (size), (e) + 1), \ + (e) = find_next_bit((addr), (size), (b) + 1)) + /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset -- cgit v1.2.3 From 7372971c1be5b7d4fdd8ad237798bdc1d1d54162 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 11 Jan 2022 10:19:22 +0300 Subject: rtc: mc146818-lib: fix signedness bug in mc146818_get_time() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mc146818_get_time() function returns zero on success or a negative error code on failure. It needs to be type int.
Fixes: d35786b3a28d ("rtc: mc146818-lib: change return values of mc146818_get_time()") Signed-off-by: Dan Carpenter Reviewed-by: Mateusz Jończyk Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220111071922.GE11243@kili --- include/linux/mc146818rtc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index 67fb0a12becc..808bb4cee230 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -124,7 +124,7 @@ struct cmos_rtc_board_info { #endif /* ARCH_RTC_LOCATION */ bool mc146818_does_rtc_work(void); -unsigned int mc146818_get_time(struct rtc_time *time); +int mc146818_get_time(struct rtc_time *time); int mc146818_set_time(struct rtc_time *time); bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), -- cgit v1.2.3 From 3fe7fa5843d204e235d92902190fecb972a3f9cc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 10 Dec 2021 15:09:21 -0500 Subject: mm: Add folio_put_refs() This is like folio_put(), but puts N references at once instead of just one. It's like put_page_refs(), but does one atomic operation instead of two, and is available to more than just gup.c. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- include/linux/mm.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c768a7c81b0b..cb98f75b245e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1244,6 +1244,26 @@ static inline void folio_put(struct folio *folio) __put_page(&folio->page); } +/** + * folio_put_refs - Reduce the reference count on a folio. + * @folio: The folio. + * @refs: The amount to subtract from the folio's reference count. 
+ * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling folio_put_refs() unless you can be sure that these weren't + * the last references. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put_refs(struct folio *folio, int refs) +{ + if (folio_ref_sub_and_test(folio, refs)) + __put_page(&folio->page); +} + static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); -- cgit v1.2.3 From a6097180d884ddab769fb25588ea8598589c218c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 17 Jan 2022 09:07:26 +1100 Subject: devtmpfs regression fix: reconfigure on each mount Prior to Linux v5.4 devtmpfs used mount_single() which treats the given mount options as "remount" options, so it updates the configuration of the single super_block on each mount. Since that was changed, the mount options used for devtmpfs are ignored. This is a regression which affect systemd - which mounts devtmpfs with "-o mode=755,size=4m,nr_inodes=1m". 
This patch restores the "remount" effect by calling reconfigure_single() Fixes: d401727ea0d7 ("devtmpfs: don't mix {ramfs,shmem}_fill_super() with mount_single()") Acked-by: Christian Brauner Cc: Al Viro Signed-off-by: NeilBrown Signed-off-by: Linus Torvalds --- include/linux/fs_context.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 6b54982fc5f3..13fa6f3df8e4 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -142,6 +142,8 @@ extern void put_fs_context(struct fs_context *fc); extern int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param); extern void fc_drop_locked(struct fs_context *fc); +int reconfigure_single(struct super_block *s, + int flags, void *data); /* * sget() wrappers to be called from the ->get_tree() op. -- cgit v1.2.3 From be80a1d3f9dbe5aee79a325964f7037fe2d92f30 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 Jan 2022 14:05:49 +0000 Subject: bpf: Generalize check_ctx_reg for reuse with other types Generalize the check_ctx_reg() helper function into a more generic named one so that it can be reused for other register types as well to check whether their offset is non-zero. No functional change. 
Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 143401d4c9d9..e9993172f892 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -519,8 +519,8 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno); +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -- cgit v1.2.3 From a672b2e36a648afb04ad3bda93b6bda947a479a5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jan 2022 11:11:30 +0000 Subject: bpf: Fix ringbuf memory type confusion when passing to helpers The bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM in their bpf_func_proto definition as their first argument, and thus both expect the result from a prior bpf_ringbuf_reserve() call which has a return type of RET_PTR_TO_ALLOC_MEM_OR_NULL. While the non-NULL memory from bpf_ringbuf_reserve() can be passed to other helpers, the two sinks (bpf_ringbuf_submit(), bpf_ringbuf_discard()) right now only enforce a register type of PTR_TO_MEM. This can lead to potential type confusion since it would allow other PTR_TO_MEM memory to be passed into the two sinks which did not come from bpf_ringbuf_reserve(). 
Add a new MEM_ALLOC composable type attribute for PTR_TO_MEM, and enforce that: - bpf_ringbuf_reserve() returns NULL or PTR_TO_MEM | MEM_ALLOC - bpf_ringbuf_submit() and bpf_ringbuf_discard() only take PTR_TO_MEM | MEM_ALLOC but not plain PTR_TO_MEM arguments via ARG_PTR_TO_ALLOC_MEM - however, other helpers might treat PTR_TO_MEM | MEM_ALLOC as plain PTR_TO_MEM to populate the memory area when they use ARG_PTR_TO_{UNINIT_,}MEM in their func proto description Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6e947cd91152..fa517ae604ad 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -316,7 +316,12 @@ enum bpf_type_flag { */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_RDONLY, + /* MEM was "allocated" from a different helper, and cannot be mixed + * with regular non-MEM_ALLOC'ed MEM types. + */ + MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_ALLOC, }; /* Max number of base types. */ @@ -400,7 +405,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. 
Its purpose is to ensure the enum is -- cgit v1.2.3 From e6eec09b7bc7869a49ac0ff376415bad40030ade Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:15 +0000 Subject: KVM: Drop unused kvm_vcpu.pre_pcpu field Remove kvm_vcpu.pre_pcpu as it no longer has any users. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3c47b146851a..5c3c67b6318f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -309,7 +309,6 @@ struct kvm_vcpu { u64 requests; unsigned long guest_debug; - int pre_pcpu; struct list_head blocked_vcpu_list; struct mutex mutex; -- cgit v1.2.3 From 12a8eee5686ef3ea7d8db90cd664f11e4a39e349 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:16 +0000 Subject: KVM: Move x86 VMX's posted interrupt list_head to vcpu_vmx Move the seemingly generic block_vcpu_list from kvm_vcpu to vcpu_vmx, and rename the list and all associated variables to clarify that it tracks the set of vCPU that need to be poked on a posted interrupt to the wakeup vector. The list is not used to track _all_ vCPUs that are blocking, and the term "blocked" can be misleading as it may refer to a blocking condition in the host or the guest, where as the PI wakeup case is specifically for the vCPUs that are actively blocking from within the guest. No functional change intended. 
Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c3c67b6318f..f079820f52b5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -309,8 +309,6 @@ struct kvm_vcpu { u64 requests; unsigned long guest_debug; - struct list_head blocked_vcpu_list; - struct mutex mutex; struct kvm_run *run; -- cgit v1.2.3 From 1ca3fb3abd2b615c4b61728de545760a6e2c2d8b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Jan 2022 18:07:45 -0800 Subject: mm: percpu: add pcpu_fc_cpu_to_node_fn_t typedef Add pcpu_fc_cpu_to_node_fn_t and pass it into pcpu_fc_alloc_fn_t, pcpu first chunk allocation will call it to alloc memblock on the corresponding node by it, this is prepare for the next patch. Link: https://lkml.kernel.org/r/20211216112359.103822-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Albert Ou Cc: Catalin Marinas Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index ae4004e7957e..e4078bf45fd5 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -94,8 +94,9 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; -typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, - size_t align); +typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); @@ -111,12 +112,14 @@ extern void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); -- cgit v1.2.3 From 23f917169ef157aa7a6bf80d8c4aad6f1282852c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Jan 2022 18:07:49 -0800 Subject: mm: percpu: add generic pcpu_fc_alloc/free function With the previous patch, we could add a generic pcpu first chunk allocate and free function to clean up the duplicated definitions on each architecture.
Link: https://lkml.kernel.org/r/20211216112359.103822-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Albert Ou Cc: Catalin Marinas Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: "Rafael J. Wysocki" Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e4078bf45fd5..d73c97ef4ff4 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -95,9 +95,6 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu); -typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); -typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); @@ -112,16 +109,12 @@ extern void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn); + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif -- cgit v1.2.3 From 
20c035764626c56c4f6514936b9ee4be0f4cd962 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Jan 2022 18:07:53 -0800 Subject: mm: percpu: add generic pcpu_populate_pte() function With NEED_PER_CPU_PAGE_FIRST_CHUNK enabled, we need a function to populate pte, this patch adds a generic pcpu populate pte function, pcpu_populate_pte(), which is marked __weak and used on most architectures, but it is overridden on x86, which has its own implementation. Link: https://lkml.kernel.org/r/20211216112359.103822-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Albert Ou Cc: Catalin Marinas Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Thomas Bogendoerfer Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d73c97ef4ff4..f1ec5ad1351c 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -95,7 +95,6 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu); -typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, @@ -113,9 +112,9 @@ extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +void __init pcpu_populate_pte(unsigned long addr); extern int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_populate_pte_fn_t 
populate_pte_fn); + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); #endif extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1); -- cgit v1.2.3 From ae62fbe299629d3b2fa61d4cf5146258c4d99fdf Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 19 Jan 2022 18:08:00 -0800 Subject: proc: make the proc_create[_data]() stubs static inlines Change the proc_create[_data]() stubs, which are used when CONFIG_PROC_FS is not set, from #defines to static inline stubs. This should fix clang -Werror builds failing due to errors like this: drivers/platform/x86/thinkpad_acpi.c:918:30: error: unused variable 'dispatch_proc_ops' [-Werror,-Wunused-const-variable] Fixing this in include/linux/proc_fs.h should ensure that the same issue is also fixed in any other drivers hitting the same -Werror issue. [akpm@linux-foundation.org: fix CONFIG_PROC_FS=n] [akpm@linux-foundation.org: fix arch/sparc/kernel/led.c] [akpm@linux-foundation.org: fix build] Link: https://lkml.kernel.org/r/20211116131112.508304-1-hdegoede@redhat.com Signed-off-by: Hans de Goede Reported-by: kernel test robot Acked-by: Christian Brauner Cc: Alexander Viro Cc: Hans de Goede Cc: David Howells Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/proc_fs.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 069c7fd95396..01b9268451a8 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -178,8 +178,16 @@ static inline struct proc_dir_entry *proc_mkdir_mode(const char *name, #define proc_create_seq(name, mode, parent, ops) ({NULL;}) #define proc_create_single(name, mode, parent, show) ({NULL;}) #define proc_create_single_data(name, mode, parent, show, data) ({NULL;}) -#define proc_create(name, mode, parent, proc_ops) ({NULL;}) -#define proc_create_data(name, mode, parent, proc_ops, data) ({NULL;}) + +static inline
struct proc_dir_entry * +proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, + const struct proc_ops *proc_ops) +{ return NULL; } + +static inline struct proc_dir_entry * +proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, + const struct proc_ops *proc_ops, void *data) +{ return NULL; } static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {} static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {} -- cgit v1.2.3 From 22c033989c3eb9731ad0c497dfab4231b8e367d6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jan 2022 18:08:12 -0800 Subject: include/linux/unaligned: replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. The rest of the changes are induced by the above and may not be split. Link: https://lkml.kernel.org/r/20211209123823.20425-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Arend van Spriel [brcmfmac] Acked-by: Kalle Valo Cc: Arend van Spriel Cc: Franky Lin Cc: Hante Meuleman Cc: Chi-hsien Lin Cc: Wright Feng Cc: Chung-hsien Hsu Cc: Kalle Valo Cc: David S. 
Miller Cc: Jakub Kicinski Cc: Heikki Krogerus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/unaligned/packed_struct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/unaligned/packed_struct.h b/include/linux/unaligned/packed_struct.h index c0d817de4df2..f4c8eaf4d012 100644 --- a/include/linux/unaligned/packed_struct.h +++ b/include/linux/unaligned/packed_struct.h @@ -1,7 +1,7 @@ #ifndef _LINUX_UNALIGNED_PACKED_STRUCT_H #define _LINUX_UNALIGNED_PACKED_STRUCT_H -#include +#include struct __una_u16 { u16 x; } __packed; struct __una_u32 { u32 x; } __packed; -- cgit v1.2.3 From 40cbf09f060c8febef64541c463d4dd526abe445 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jan 2022 18:08:16 -0800 Subject: kernel.h: include a note to discourage people from including it in headers Include a note at the top to discourage people from including it in headers. Link: https://lkml.kernel.org/r/20211209150803.4473-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 77755ac3e189..36a612d82956 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -1,4 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * NOTE: + * + * This header has combined a lot of unrelated to each other stuff. + * The process of splitting its content is in progress while keeping + * backward compatibility. That's why it's highly recommended NOT to + * include this header inside another header file, especially under + * generic or architectural include/ directory. 
+ */ #ifndef _LINUX_KERNEL_H #define _LINUX_KERNEL_H -- cgit v1.2.3 From 95af469c4f609de011debc08e7a35b45201623a8 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:29 -0800 Subject: fs/binfmt_elf: replace open-coded string copy with get_task_comm It is better to use get_task_comm() instead of the open coded string copy as we do in other places. struct elf_prpsinfo is used to dump the task information in userspace coredump or kernel vmcore. Below is the verification of vmcore, crash> ps PID PPID CPU TASK ST %MEM VSZ RSS COMM 0 0 0 ffffffff9d21a940 RU 0.0 0 0 [swapper/0] > 0 0 1 ffffa09e40f85e80 RU 0.0 0 0 [swapper/1] > 0 0 2 ffffa09e40f81f80 RU 0.0 0 0 [swapper/2] > 0 0 3 ffffa09e40f83f00 RU 0.0 0 0 [swapper/3] > 0 0 4 ffffa09e40f80000 RU 0.0 0 0 [swapper/4] > 0 0 5 ffffa09e40f89f80 RU 0.0 0 0 [swapper/5] 0 0 6 ffffa09e40f8bf00 RU 0.0 0 0 [swapper/6] > 0 0 7 ffffa09e40f88000 RU 0.0 0 0 [swapper/7] > 0 0 8 ffffa09e40f8de80 RU 0.0 0 0 [swapper/8] > 0 0 9 ffffa09e40f95e80 RU 0.0 0 0 [swapper/9] > 0 0 10 ffffa09e40f91f80 RU 0.0 0 0 [swapper/10] > 0 0 11 ffffa09e40f93f00 RU 0.0 0 0 [swapper/11] > 0 0 12 ffffa09e40f90000 RU 0.0 0 0 [swapper/12] > 0 0 13 ffffa09e40f9bf00 RU 0.0 0 0 [swapper/13] > 0 0 14 ffffa09e40f98000 RU 0.0 0 0 [swapper/14] > 0 0 15 ffffa09e40f9de80 RU 0.0 0 0 [swapper/15] It works well as expected. Some comments are added to explain why we use the hard-coded 16. 
Link: https://lkml.kernel.org/r/20211120112738.45980-5-laoar.shao@gmail.com Suggested-by: Kees Cook Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Al Viro Cc: Kees Cook Cc: Petr Mladek Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Dennis Dalessandro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/elfcore-compat.h | 5 +++++ include/linux/elfcore.h | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/elfcore-compat.h b/include/linux/elfcore-compat.h index e272c3d452ce..54feb64e9b5d 100644 --- a/include/linux/elfcore-compat.h +++ b/include/linux/elfcore-compat.h @@ -43,6 +43,11 @@ struct compat_elf_prpsinfo __compat_uid_t pr_uid; __compat_gid_t pr_gid; compat_pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; + /* + * The hard-coded 16 is derived from TASK_COMM_LEN, but it can't be + * changed as it is exposed to userspace. We'd better make it hard-coded + * here. + */ char pr_fname[16]; char pr_psargs[ELF_PRARGSZ]; }; diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 957ebec35aad..746e081879a5 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -65,6 +65,11 @@ struct elf_prpsinfo __kernel_gid_t pr_gid; pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; /* Lots missing */ + /* + * The hard-coded 16 is derived from TASK_COMM_LEN, but it can't be + * changed as it is exposed to userspace. We'd better make it hard-coded + * here. 
+ */ char pr_fname[16]; /* filename of executable */ char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ }; -- cgit v1.2.3 From 3087c61ed2c48548b74dd343a5209b87082c682d Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:40 -0800 Subject: tools/testing/selftests/bpf: replace open-coded 16 with TASK_COMM_LEN As the sched:sched_switch tracepoint args are derived from the kernel, we'd better make it same with the kernel. So the macro TASK_COMM_LEN is converted to type enum, then all the BPF programs can get it through BTF. The BPF program which wants to use TASK_COMM_LEN should include the header vmlinux.h. Regarding the test_stacktrace_map and test_tracepoint, as the type defined in linux/bpf.h are also defined in vmlinux.h, so we don't need to include linux/bpf.h again. Link: https://lkml.kernel.org/r/20211120112738.45980-8-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Andrii Nakryiko Acked-by: David Hildenbrand Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Al Viro Cc: Kees Cook Cc: Petr Mladek Cc: Alexei Starovoitov Cc: Dennis Dalessandro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 78c351e35fec..cecd4806edc6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -274,8 +274,13 @@ struct task_group; #define get_current_state() READ_ONCE(current->__state) -/* Task command name length: */ -#define TASK_COMM_LEN 16 +/* + * Define the task command name length as enum, then it can be visible to + * BPF programs. 
+ */ +enum { + TASK_COMM_LEN = 16, +}; extern void scheduler_tick(void); -- cgit v1.2.3 From d6986ce24fc00b0638bd29efe8fb7ba7619ed2aa Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:43 -0800 Subject: kthread: dynamically allocate memory to store kthread's full name When I was implementing a new per-cpu kthread cfs_migration, I found the comm of it "cfs_migration/%u" is truncated due to the limitation of TASK_COMM_LEN. For example, the comm of the percpu thread on CPU10~19 all have the same name "cfs_migration/1", which will confuse the user. This issue is not critical, because we can get the corresponding CPU from the task's Cpus_allowed. But for kthreads corresponding to other hardware devices, it is not easy to get the detailed device info from task comm, for example, jbd2/nvme0n1p2- xfs-reclaim/sdf Currently there are so many truncated kthreads: rcu_tasks_kthre rcu_tasks_rude_ rcu_tasks_trace poll_mpt3sas0_s ext4-rsv-conver xfs-reclaim/sd{a, b, c, ...} xfs-blockgc/sd{a, b, c, ...} xfs-inodegc/sd{a, b, c, ...} audit_send_repl ecryptfs-kthrea vfio-irqfd-clea jbd2/nvme0n1p2- ... We can shorten these names to work around this problem, but it may be not applied to all of the truncated kthreads. Take 'jbd2/nvme0n1p2-' for example, it is a nice name, and it is not a good idea to shorten it. One possible way to fix this issue is extending the task comm size, but as task->comm is used in lots of places, that may cause some potential buffer overflows. Another more conservative approach is introducing a new pointer to store kthread's full name if it is truncated, which won't introduce too much overhead as it is in the non-critical path. Finally we make a decision to use the second approach. 
See also the discussions in this thread: https://lore.kernel.org/lkml/20211101060419.4682-1-laoar.shao@gmail.com/ After this change, the full name of these truncated kthreads will be displayed via /proc/[pid]/comm: rcu_tasks_kthread rcu_tasks_rude_kthread rcu_tasks_trace_kthread poll_mpt3sas0_statu ext4-rsv-conversion xfs-reclaim/sdf1 xfs-blockgc/sdf1 xfs-inodegc/sdf1 audit_send_reply ecryptfs-kthread vfio-irqfd-cleanup jbd2/nvme0n1p2-8 Link: https://lkml.kernel.org/r/20211120112850.46047-1-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Petr Mladek Suggested-by: Petr Mladek Suggested-by: Steven Rostedt Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 346b0f269161..2a5c04494663 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -33,6 +33,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), unsigned int cpu, const char *namefmt); +void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk); void set_kthread_struct(struct task_struct *p); void kthread_set_per_cpu(struct task_struct *k, int cpu); -- cgit v1.2.3 From 0425473037db40d9e322631f2d4dc6ef51f97e88 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jan 2022 18:08:56 -0800 Subject: list: introduce list_is_head() helper and re-use it in list.h Introduce list_is_head() in the similar (*) way as it's done for list_entry_is_head(). Make use of it in the list.h. *) it's done as inliner and not a macro to be aligned with other list_is_*() APIs; while at it, make all three to have the same style. 
Link: https://lkml.kernel.org/r/20211201141824.81400-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Heikki Krogerus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 6636fc07f918..dd6c2041d09c 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -258,8 +258,7 @@ static inline void list_bulk_move_tail(struct list_head *head, * @list: the entry to test * @head: the head of the list */ -static inline int list_is_first(const struct list_head *list, - const struct list_head *head) +static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } @@ -269,12 +268,21 @@ static inline int list_is_first(const struct list_head *list, * @list: the entry to test * @head: the head of the list */ -static inline int list_is_last(const struct list_head *list, - const struct list_head *head) +static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } +/** + * list_is_head - tests whether @list is the list @head + * @list: the entry to test + * @head: the head of the list + */ +static inline int list_is_head(const struct list_head *list, const struct list_head *head) +{ + return list == head; +} + /** * list_empty - tests whether a list is empty * @head: the list to test. 
@@ -318,7 +326,7 @@ static inline void list_del_init_careful(struct list_head *entry) static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); - return (next == head) && (next == head->prev); + return list_is_head(next, head) && (next == head->prev); } /** @@ -393,10 +401,9 @@ static inline void list_cut_position(struct list_head *list, { if (list_empty(head)) return; - if (list_is_singular(head) && - (head->next != entry && head != entry)) + if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; - if (entry == head) + if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); @@ -570,7 +577,7 @@ static inline void list_splice_tail_init(struct list_head *list, * @head: the head for your list. */ #define list_for_each(pos, head) \ - for (pos = (head)->next; pos != (head); pos = pos->next) + for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_continue - continue iteration over a list @@ -580,7 +587,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ - for (pos = pos->next; pos != (head); pos = pos->next) + for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards @@ -588,7 +595,7 @@ static inline void list_splice_tail_init(struct list_head *list, * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ - for (pos = (head)->prev; pos != (head); pos = pos->prev) + for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry @@ -597,8 +604,9 @@ static inline void list_splice_tail_init(struct list_head *list, * @head: the head for your list. 
*/ #define list_for_each_safe(pos, n, head) \ - for (pos = (head)->next, n = pos->next; pos != (head); \ - pos = n, n = pos->next) + for (pos = (head)->next, n = pos->next; \ + !list_is_head(pos, (head)); \ + pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry @@ -608,7 +616,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ - pos != (head); \ + !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** -- cgit v1.2.3 From fd0a1462405b087377e59b84e119fe7e2d08499a Mon Sep 17 00:00:00 2001 From: Isabella Basso Date: Wed, 19 Jan 2022 18:09:02 -0800 Subject: hash.h: remove unused define directive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "test_hash.c: refactor into KUnit", v3. We refactored the lib/test_hash.c file into KUnit as part of the student group LKCAMP [1] introductory hackathon for kernel development. This test was pointed to our group by Daniel Latypov [2], so its full conversion into a pure KUnit test was our goal in this patch series, but we ran into many problems relating to it not being split as unit tests, which complicated matters a bit, as the reasoning behind the original tests is quite cryptic for those unfamiliar with hash implementations. Some interesting developments we'd like to highlight are: - In patch 1/5 we noticed that there was an unused define directive that could be removed. - In patch 4/5 we noticed how stringhash and hash tests are all under the lib/test_hash.c file, which might cause some confusion, and we also broke those kernel config entries up. 
Overall KUnit developments have been made in the other patches in this series: In patches 2/5, 3/5 and 5/5 we refactored the lib/test_hash.c file so as to make it more compatible with the KUnit style, whilst preserving the original idea of the maintainer who designed it (i.e. George Spelvin), which might be undesirable for unit tests, but we assume it is enough for a first patch. This patch (of 5): Currently, there exist hash_32() and __hash_32() functions, which were introduced in a patch [1] targeting architecture specific optimizations. These functions can be overridden on a per-architecture basis to achieve such optimizations. They must set their corresponding define directive (HAVE_ARCH_HASH_32 and HAVE_ARCH__HASH_32, respectively) so that header files can deal with these overrides properly. As the supported 32-bit architectures that have their own hash function implementation (i.e. m68k, Microblaze, H8/300, pa-risc) have only been making use of the (more general) __hash_32() function (which only lacks a right shift operation when compared to the hash_32() function), remove the define directive corresponding to the arch-specific hash_32() implementation. 
[1] https://lore.kernel.org/lkml/20160525073311.5600.qmail@ns.sciencehorizons.net/ [akpm@linux-foundation.org: hash_32_generic() becomes hash_32()] Link: https://lkml.kernel.org/r/20211208183711.390454-1-isabbasso@riseup.net Link: https://lkml.kernel.org/r/20211208183711.390454-2-isabbasso@riseup.net Reviewed-by: David Gow Tested-by: David Gow Co-developed-by: Augusto Durães Camargo Signed-off-by: Augusto Durães Camargo Co-developed-by: Enzo Ferreira Signed-off-by: Enzo Ferreira Signed-off-by: Isabella Basso Cc: Geert Uytterhoeven Cc: Brendan Higgins Cc: Daniel Latypov Cc: Shuah Khan Cc: Rodrigo Siqueira Cc: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hash.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hash.h b/include/linux/hash.h index ad6fa21d977b..38edaa08f862 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -62,10 +62,7 @@ static inline u32 __hash_32_generic(u32 val) return val * GOLDEN_RATIO_32; } -#ifndef HAVE_ARCH_HASH_32 -#define hash_32 hash_32_generic -#endif -static inline u32 hash_32_generic(u32 val, unsigned int bits) +static inline u32 hash_32(u32 val, unsigned int bits) { /* High bits are more random, so use them. */ return __hash_32(val) >> (32 - bits); -- cgit v1.2.3 From a3d5dc908a5f572ce3e31fe83fd2459a1c3c5422 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 19 Jan 2022 18:10:02 -0800 Subject: delayacct: support swapin delay accounting for swapping without blkio Currently delayacct accounts swapin delay only for swapping that cause blkio. If we use zram for swapping, tools/accounting/getdelays can't get any SWAP delay. It's useful to get zram swapin delay information, for example to adjust compress algorithm or /proc/sys/vm/swappiness. Reference to PSI, it accounts any kind of swapping by doing its work in swap_readpage(), no matter whether swapping causes blkio. Let delayacct do the similar work. 
Link: https://lkml.kernel.org/r/20211112083813.8559-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reported-by: Zeal Robot Cc: Balbir Singh Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index af7e6eb50283..b96d68f310a2 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -9,14 +9,6 @@ #include -/* - * Per-task flags relevant to delay accounting - * maintained privately to avoid exhausting similar flags in sched.h:PF_* - * Used to set current->delays->flags - */ -#define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */ -#define DELAYACCT_PF_BLKIO 0x00000002 /* I am waiting on IO */ - #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { raw_spinlock_t lock; @@ -37,13 +29,13 @@ struct task_delay_info { * associated with the operation is added to XXX_delay. * XXX_delay contains the accumulated delay time in nanoseconds. 
*/ - u64 blkio_start; /* Shared by blkio, swapin */ + u64 blkio_start; u64 blkio_delay; /* wait for sync block io completion */ - u64 swapin_delay; /* wait for swapin block io completion */ + u64 swapin_start; + u64 swapin_delay; /* wait for swapin */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ - u32 swapin_count; /* total count of the number of swapin block */ - /* io operations performed */ + u32 swapin_count; /* total count of swapin */ u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ @@ -79,14 +71,8 @@ extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); extern void __delayacct_thrashing_start(void); extern void __delayacct_thrashing_end(void); - -static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) -{ - if (p->delays) - return (p->delays->flags & DELAYACCT_PF_BLKIO); - else - return 0; -} +extern void __delayacct_swapin_start(void); +extern void __delayacct_swapin_end(void); static inline void delayacct_set_flag(struct task_struct *p, int flag) { @@ -123,7 +109,6 @@ static inline void delayacct_blkio_start(void) if (!static_branch_unlikely(&delayacct_key)) return; - delayacct_set_flag(current, DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); } @@ -135,7 +120,6 @@ static inline void delayacct_blkio_end(struct task_struct *p) if (p->delays) __delayacct_blkio_end(p); - delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); } static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) @@ -169,6 +153,18 @@ static inline void delayacct_thrashing_end(void) __delayacct_thrashing_end(); } +static inline void delayacct_swapin_start(void) +{ + if (current->delays) + __delayacct_swapin_start(); +} + +static inline void delayacct_swapin_end(void) +{ + if (current->delays) + __delayacct_swapin_end(); +} + #else static inline void delayacct_set_flag(struct task_struct *p, int flag) {} @@ -199,6 +195,10 @@ static inline void 
delayacct_thrashing_start(void) {} static inline void delayacct_thrashing_end(void) {} +static inline void delayacct_swapin_start(void) +{} +static inline void delayacct_swapin_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ -- cgit v1.2.3 From 82065b7266899fbdce4c7394d7dd02688161f0cf Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 19 Jan 2022 18:10:06 -0800 Subject: delayacct: fix incomplete disable operation when switch enable to disable When a task is created after delayacct is enabled, kernel will do all the delay accountings for that task. The problem is that if the user disables delayacct by setting /proc/sys/kernel/task_delayacct to zero, only blkio delay accounting is disabled. Now disable all the kinds of delay accountings when /proc/sys/kernel/task_delayacct is set to zero. Link: https://lkml.kernel.org/r/20211123140342.32962-1-ran.xiaokai@zte.com.cn Signed-off-by: Yang Yang Reported-by: Zeal Robot Cc: Balbir Singh Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index b96d68f310a2..c675cfb6437e 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -131,36 +131,54 @@ static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) static inline void delayacct_freepages_start(void) { + if (!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_freepages_start(); } static inline void delayacct_freepages_end(void) { + if (!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_freepages_end(); } static inline void delayacct_thrashing_start(void) { + if (!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_thrashing_start(); } static inline void delayacct_thrashing_end(void) { + if 
(!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_thrashing_end(); } static inline void delayacct_swapin_start(void) { + if (!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_swapin_start(); } static inline void delayacct_swapin_end(void) { + if (!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_swapin_end(); } -- cgit v1.2.3 From 1193829da1a6728249cd02577a020bd64fd9c160 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 19 Jan 2022 18:10:09 -0800 Subject: delayacct: cleanup flags in struct task_delay_info and functions use it Flags in struct task_delay_info is used to distinguish the difference between swapin and blkio delay accounting. But after patch "delayacct: support swapin delay accounting for swapping without blkio", there is no need to do that since swapin and blkio delay accounting use their own functions. Link: https://lkml.kernel.org/r/20211124065958.36703-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Cc: Balbir Singh Cc: Ingo Molnar Cc: Johannes Weiner Cc: Peter Zijlstra Cc: Zeal Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index c675cfb6437e..435c3654a0ff 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -12,7 +12,6 @@ #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { raw_spinlock_t lock; - unsigned int flags; /* Private per-task flags */ /* For each stat XXX, add following, aligned appropriately * @@ -74,18 +73,6 @@ extern void __delayacct_thrashing_end(void); extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); -static inline void delayacct_set_flag(struct task_struct *p, int flag) -{ - if (p->delays) - p->delays->flags |= flag; -} - -static inline void 
delayacct_clear_flag(struct task_struct *p, int flag) -{ - if (p->delays) - p->delays->flags &= ~flag; -} - static inline void delayacct_tsk_init(struct task_struct *tsk) { /* reinitialize in case parent's non-null pointer was dup'ed*/ @@ -184,10 +171,6 @@ static inline void delayacct_swapin_end(void) } #else -static inline void delayacct_set_flag(struct task_struct *p, int flag) -{} -static inline void delayacct_clear_flag(struct task_struct *p, int flag) -{} static inline void delayacct_init(void) {} static inline void delayacct_tsk_init(struct task_struct *tsk) -- cgit v1.2.3 From 5bf18281534451bf1ad56a45a3085cd7ad46860d Mon Sep 17 00:00:00 2001 From: wangyong Date: Wed, 19 Jan 2022 18:10:15 -0800 Subject: delayacct: track delays from memory compact Delay accounting does not track the delay of memory compact. When there is not enough free memory, tasks can spend an amount of their time waiting for compact. To get the impact of tasks in direct memory compact, measure the delay when allocating memory through memory compact. 
Also update tools/accounting/getdelays.c: / # ./getdelays_next -di -p 304 print delayacct stats ON printing IO accounting PID 304 CPU count real total virtual total delay total delay average 277 780000000 849039485 18877296 0.068ms IO count delay total delay average 0 0 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 5 11088812685 2217ms THRASHING count delay total delay average 0 0 0ms COMPACT count delay total delay average 3 72758 0ms watch: read=0, write=0, cancelled_write=0 Link: https://lkml.kernel.org/r/1638619795-71451-1-git-send-email-wang.yong12@zte.com.cn Signed-off-by: wangyong Reviewed-by: Jiang Xuexin Reviewed-by: Zhang Wenya Reviewed-by: Yang Yang Reviewed-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 435c3654a0ff..3e03d010bd2e 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -42,8 +42,12 @@ struct task_delay_info { u64 thrashing_start; u64 thrashing_delay; /* wait for thrashing page */ + u64 compact_start; + u64 compact_delay; /* wait for memory compact */ + u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ + u32 compact_count; /* total count of memory compact */ }; #endif @@ -72,6 +76,8 @@ extern void __delayacct_thrashing_start(void); extern void __delayacct_thrashing_end(void); extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); +extern void __delayacct_compact_start(void); +extern void __delayacct_compact_end(void); static inline void delayacct_tsk_init(struct task_struct *tsk) { @@ -170,6 +176,24 @@ static inline void delayacct_swapin_end(void) __delayacct_swapin_end(); } +static inline void delayacct_compact_start(void) +{ + if 
(!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_compact_start(); +} + +static inline void delayacct_compact_end(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_compact_end(); +} + #else static inline void delayacct_init(void) {} @@ -200,6 +224,10 @@ static inline void delayacct_swapin_start(void) {} static inline void delayacct_swapin_end(void) {} +static inline void delayacct_compact_start(void) +{} +static inline void delayacct_compact_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ -- cgit v1.2.3 From 66a8f7f04979f4ad739085f01d99c8caf620b4f5 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 18 Jan 2022 18:35:02 +0100 Subject: of: base: make small of_parse_phandle() variants static inline Make all the smaller variants of the of_parse_phandle() static inline. This also let us remove the empty function stubs if CONFIG_OF is not defined. Suggested-by: Rob Herring Signed-off-by: Michael Walle [robh: move index < 0 check into __of_parse_phandle_with_args] Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220118173504.2867523-2-michael@walle.cc --- include/linux/of.h | 148 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 120 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index ff143a027abc..16d76c92fbe0 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -364,18 +364,12 @@ extern const struct of_device_id *of_match_node( const struct of_device_id *matches, const struct device_node *node); extern int of_modalias_node(struct device_node *node, char *modalias, int len); extern void of_print_phandle_args(const char *msg, const struct of_phandle_args *args); -extern struct device_node *of_parse_phandle(const struct device_node *np, - const char *phandle_name, - int index); -extern int of_parse_phandle_with_args(const struct device_node *np, - const char 
*list_name, const char *cells_name, int index, - struct of_phandle_args *out_args); +extern int __of_parse_phandle_with_args(const struct device_node *np, + const char *list_name, const char *cells_name, int cell_count, + int index, struct of_phandle_args *out_args); extern int of_parse_phandle_with_args_map(const struct device_node *np, const char *list_name, const char *stem_name, int index, struct of_phandle_args *out_args); -extern int of_parse_phandle_with_fixed_args(const struct device_node *np, - const char *list_name, int cells_count, int index, - struct of_phandle_args *out_args); extern int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name); @@ -865,18 +859,12 @@ static inline int of_property_read_string_helper(const struct device_node *np, return -ENOSYS; } -static inline struct device_node *of_parse_phandle(const struct device_node *np, - const char *phandle_name, - int index) -{ - return NULL; -} - -static inline int of_parse_phandle_with_args(const struct device_node *np, - const char *list_name, - const char *cells_name, - int index, - struct of_phandle_args *out_args) +static inline int __of_parse_phandle_with_args(const struct device_node *np, + const char *list_name, + const char *cells_name, + int cell_count, + int index, + struct of_phandle_args *out_args) { return -ENOSYS; } @@ -890,13 +878,6 @@ static inline int of_parse_phandle_with_args_map(const struct device_node *np, return -ENOSYS; } -static inline int of_parse_phandle_with_fixed_args(const struct device_node *np, - const char *list_name, int cells_count, int index, - struct of_phandle_args *out_args) -{ - return -ENOSYS; -} - static inline int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name) @@ -1077,6 +1058,117 @@ static inline bool of_node_is_type(const struct device_node *np, const char *typ return np && match && type && !strcmp(match, type); } +/** + * of_parse_phandle - 
Resolve a phandle property to a device_node pointer + * @np: Pointer to device node holding phandle property + * @phandle_name: Name of property holding a phandle value + * @index: For properties holding a table of phandles, this is the index into + * the table + * + * Return: The device_node pointer with refcount incremented. Use + * of_node_put() on it when done. + */ +static inline struct device_node *of_parse_phandle(const struct device_node *np, + const char *phandle_name, + int index) +{ + struct of_phandle_args args; + + if (__of_parse_phandle_with_args(np, phandle_name, NULL, 0, + index, &args)) + return NULL; + + return args.np; +} + +/** + * of_parse_phandle_with_args() - Find a node pointed by phandle in a list + * @np: pointer to a device tree node containing a list + * @list_name: property name that contains a list + * @cells_name: property name that specifies phandles' arguments count + * @index: index of a phandle to parse out + * @out_args: optional pointer to output arguments structure (will be filled) + * + * This function is useful to parse lists of phandles and their arguments. + * Returns 0 on success and fills out_args, on error returns appropriate + * errno value. + * + * Caller is responsible to call of_node_put() on the returned out_args->np + * pointer. 
+ * + * Example:: + * + * phandle1: node1 { + * #list-cells = <2>; + * }; + * + * phandle2: node2 { + * #list-cells = <1>; + * }; + * + * node3 { + * list = <&phandle1 1 2 &phandle2 3>; + * }; + * + * To get a device_node of the ``node2`` node you may call this: + * of_parse_phandle_with_args(node3, "list", "#list-cells", 1, &args); + */ +static inline int of_parse_phandle_with_args(const struct device_node *np, + const char *list_name, + const char *cells_name, + int index, + struct of_phandle_args *out_args) +{ + int cell_count = -1; + + /* If cells_name is NULL we assume a cell count of 0 */ + if (!cells_name) + cell_count = 0; + + return __of_parse_phandle_with_args(np, list_name, cells_name, + cell_count, index, out_args); +} + +/** + * of_parse_phandle_with_fixed_args() - Find a node pointed by phandle in a list + * @np: pointer to a device tree node containing a list + * @list_name: property name that contains a list + * @cell_count: number of argument cells following the phandle + * @index: index of a phandle to parse out + * @out_args: optional pointer to output arguments structure (will be filled) + * + * This function is useful to parse lists of phandles and their arguments. + * Returns 0 on success and fills out_args, on error returns appropriate + * errno value. + * + * Caller is responsible to call of_node_put() on the returned out_args->np + * pointer. 
+ * + * Example:: + * + * phandle1: node1 { + * }; + * + * phandle2: node2 { + * }; + * + * node3 { + * list = <&phandle1 0 2 &phandle2 2 3>; + * }; + * + * To get a device_node of the ``node2`` node you may call this: + * of_parse_phandle_with_fixed_args(node3, "list", 2, 1, &args); + */ +static inline int of_parse_phandle_with_fixed_args(const struct device_node *np, + const char *list_name, + int cell_count, + int index, + struct of_phandle_args *out_args) +{ + return __of_parse_phandle_with_args(np, list_name, NULL, cell_count, + index, out_args); +} + /** * of_property_count_u8_elems - Count the number of u8 elements in a property * -- cgit v1.2.3 From 2ca42c3ad9ed875b136065b010753a4caaaa1d38 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 18 Jan 2022 18:35:03 +0100 Subject: of: property: define of_property_read_u{8,16,32,64}_array() unconditionally We can get rid of all the empty stubs because all these functions call of_property_read_variable_u{8,16,32,64}_array() which already have an empty stub if CONFIG_OF is not defined. Signed-off-by: Michael Walle Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220118173504.2867523-3-michael@walle.cc --- include/linux/of.h | 274 ++++++++++++++++++++++++----------------------------- 1 file changed, 124 insertions(+), 150 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 16d76c92fbe0..2dc77430a91a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -410,130 +410,6 @@ extern int of_detach_node(struct device_node *); #define of_match_ptr(_ptr) (_ptr) -/** - * of_property_read_u8_array - Find and read an array of u8 from a property. - * - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. 
- * @sz: number of array elements to read - * - * Search for a property in a device node and read 8-bit value(s) from - * it. - * - * dts entry of array should be like: - * ``property = /bits/ 8 <0x50 0x60 0x70>;`` - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u8 value can be decoded. - */ -static inline int of_property_read_u8_array(const struct device_node *np, - const char *propname, - u8 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u8_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - -/** - * of_property_read_u16_array - Find and read an array of u16 from a property. - * - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. - * @sz: number of array elements to read - * - * Search for a property in a device node and read 16-bit value(s) from - * it. - * - * dts entry of array should be like: - * ``property = /bits/ 16 <0x5000 0x6000 0x7000>;`` - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u16 value can be decoded. - */ -static inline int of_property_read_u16_array(const struct device_node *np, - const char *propname, - u16 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u16_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - -/** - * of_property_read_u32_array - Find and read an array of 32 bit integers - * from a property. - * - * @np: device node from which the property value is to be read. 
- * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. - * @sz: number of array elements to read - * - * Search for a property in a device node and read 32-bit value(s) from - * it. - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u32 value can be decoded. - */ -static inline int of_property_read_u32_array(const struct device_node *np, - const char *propname, - u32 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u32_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - -/** - * of_property_read_u64_array - Find and read an array of 64 bit integers - * from a property. - * - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. - * @sz: number of array elements to read - * - * Search for a property in a device node and read 64-bit value(s) from - * it. - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u64 value can be decoded. 
- */ -static inline int of_property_read_u64_array(const struct device_node *np, - const char *propname, - u64 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u64_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - /* * struct property *prop; * const __be32 *p; @@ -728,32 +604,6 @@ static inline int of_property_count_elems_of_size(const struct device_node *np, return -ENOSYS; } -static inline int of_property_read_u8_array(const struct device_node *np, - const char *propname, u8 *out_values, size_t sz) -{ - return -ENOSYS; -} - -static inline int of_property_read_u16_array(const struct device_node *np, - const char *propname, u16 *out_values, size_t sz) -{ - return -ENOSYS; -} - -static inline int of_property_read_u32_array(const struct device_node *np, - const char *propname, - u32 *out_values, size_t sz) -{ - return -ENOSYS; -} - -static inline int of_property_read_u64_array(const struct device_node *np, - const char *propname, - u64 *out_values, size_t sz) -{ - return -ENOSYS; -} - static inline int of_property_read_u32_index(const struct device_node *np, const char *propname, u32 index, u32 *out_value) { @@ -1328,6 +1178,130 @@ static inline bool of_property_read_bool(const struct device_node *np, return prop ? true : false; } +/** + * of_property_read_u8_array - Find and read an array of u8 from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 8-bit value(s) from + * it. + * + * dts entry of array should be like: + * ``property = /bits/ 8 <0x50 0x60 0x70>;`` + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. 
+ * + * The out_values is modified only if a valid u8 value can be decoded. + */ +static inline int of_property_read_u8_array(const struct device_node *np, + const char *propname, + u8 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u8_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + +/** + * of_property_read_u16_array - Find and read an array of u16 from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 16-bit value(s) from + * it. + * + * dts entry of array should be like: + * ``property = /bits/ 16 <0x5000 0x6000 0x7000>;`` + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_values is modified only if a valid u16 value can be decoded. + */ +static inline int of_property_read_u16_array(const struct device_node *np, + const char *propname, + u16 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u16_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + +/** + * of_property_read_u32_array - Find and read an array of 32 bit integers + * from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 32-bit value(s) from + * it. + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. 
+ * + * The out_values is modified only if a valid u32 value can be decoded. + */ +static inline int of_property_read_u32_array(const struct device_node *np, + const char *propname, + u32 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u32_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + +/** + * of_property_read_u64_array - Find and read an array of 64 bit integers + * from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 64-bit value(s) from + * it. + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_values is modified only if a valid u64 value can be decoded. + */ +static inline int of_property_read_u64_array(const struct device_node *np, + const char *propname, + u64 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u64_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + static inline int of_property_read_u8(const struct device_node *np, const char *propname, u8 *out_value) -- cgit v1.2.3 From c522e3ad296b7b692ed3960dfde467f2a34b434f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Jan 2022 09:28:41 +0000 Subject: fscache: Add a comment explaining how page-release optimisation works Add a comment into fscache_note_page_release() to explain how the page-release optimisation logic works[1]. It's not entirely obvious as it has nothing to do with whether or not the netfs file contains data. FSCACHE_COOKIE_NO_DATA_TO_READ is set if we have no data in the cache yet (ie. 
the backing file lookup was negative, the file is 0 length or the cookie got invalidated). It means that we have no data in the cache, not that the file is necessarily empty on the server. FSCACHE_COOKIE_HAVE_DATA is set once we've stored data in the backing file. From that point on, we have data we *could* read - however, it's covered by pages in the netfs pagecache until at such time one of those covering pages is released. So if we've written data to the cache (HAVE_DATA) and there wasn't any data in the cache when we started (NO_DATA_TO_READ), it may no longer be true that we can skip reading from the cache. Read skipping is done by cachefiles_prepare_read(). Note that tracking is not done on a per-page basis, but only on a per-file basis. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/043a206f03929c2667a465314144e518070a9b2d.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/164251408479.3435901.9540165422908194636.stgit@warthog.procyon.org.uk/ # v1 --- include/linux/fscache.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ede50406bcb0..296c5f1d9f35 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -665,6 +665,11 @@ static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie, static inline void fscache_note_page_release(struct fscache_cookie *cookie) { + /* If we've written data to the cache (HAVE_DATA) and there wasn't any + * data in the cache when we started (NO_DATA_TO_READ), it may no + * longer be true that we can skip reading from the cache - so clear + * the flag that causes reads to be skipped. 
+ */ if (cookie && test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) && test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) -- cgit v1.2.3 From ffa65753c43142f3b803486442813744da71cff2 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 21 Jan 2022 22:10:46 -0800 Subject: mm/migrate.c: rework migration_entry_wait() to not take a pageref This fixes the FIXME in migrate_vma_check_page(). Before migrating a page migration code will take a reference and check there are no unexpected page references, failing the migration if there are. When a thread faults on a migration entry it will take a temporary reference to the page to wait for the page to become unlocked signifying the migration entry has been removed. This reference is dropped just prior to waiting on the page lock, however the extra reference can cause migration failures so it is desirable to avoid taking it. As migration code already has a reference to the migrating page an extra reference to wait on PG_locked is unnecessary so long as the reference can't be dropped whilst setting up the wait. When faulting on a migration entry the ptl is taken to check the migration entry. Removing a migration entry also requires the ptl, and migration code won't drop its page reference until after the migration entry has been removed. Therefore retaining the ptl of a migration entry is sufficient to ensure the page has a reference. Reworking migration_entry_wait() to hold the ptl until the wait setup is complete means the extra page reference is no longer needed. 
[apopple@nvidia.com: v5] Link: https://lkml.kernel.org/r/20211213033848.1973946-1-apopple@nvidia.com Link: https://lkml.kernel.org/r/20211118020754.954425-1-apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 4850cc5bf813..db96e10eb8da 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -40,6 +40,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); +void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, + spinlock_t *ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, -- cgit v1.2.3 From 3ddd9a808cee7284931312f2f3e854c9617f44b2 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:10:50 -0800 Subject: sysctl: add a new register_sysctl_init() interface Patch series "sysctl: first set of kernel/sysctl cleanups", v2. Finally had time to respin the series of the work we had started last year on cleaning up the kernel/sysct.c kitchen sink. People keeps stuffing their sysctls in that file and this creates a maintenance burden. So this effort is aimed at placing sysctls where they actually belong. I'm going to split patches up into series as there is quite a bit of work. 
This first set adds register_sysctl_init() for registering a sysctl on the init path, adds const where missing in a few places, generalizes common values so that they are easier to share, and starts moving a few sysctls out of kernel/sysctl.c to where they belong.
[mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c ] Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Iurii Zaikin Cc: "Eric W. Biederman" Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: Andy Shevchenko Cc: Sebastian Reichel Cc: Tetsuo Handa Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Qing Wang Cc: Benjamin LaHaise Cc: Al Viro Cc: Jan Kara Cc: Amir Goldstein Cc: Stephen Kitt Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 1fa2b69c6fc3..d3ab7969b6b5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -199,6 +199,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern void __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name); +#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) void do_sysctl_args(void); extern int pwrsw_enabled; -- cgit v1.2.3 From 78e36f3b0dae586f623c4a37ec5eb5496f5abbe1 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:10:55 -0800 Subject: sysctl: move some 
boundary constants from sysctl.c to sysctl_vals sysctl has helpers which let us specify boundary values for a min or max int value. Since these are used for a boundary check only they don't change, so move these variables to sysctl_vals to avoid adding duplicate variables. This will help with our cleanup of kernel/sysctl.c. [akpm@linux-foundation.org: update it for "mm/pagealloc: sysctl: change watermark_scale_factor max limit to 30%"] [mcgrof@kernel.org: major rebase] Link: https://lkml.kernel.org/r/20211123202347.818157-3-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index d3ab7969b6b5..47cf70c8eb93 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -38,9 +38,16 @@ struct ctl_table_header; struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ -#define SYSCTL_ZERO ((void *)&sysctl_vals[0]) -#define SYSCTL_ONE ((void *)&sysctl_vals[1]) -#define SYSCTL_INT_MAX ((void *)&sysctl_vals[2]) +#define SYSCTL_NEG_ONE ((void *)&sysctl_vals[0]) +#define SYSCTL_ZERO ((void *)&sysctl_vals[1]) +#define SYSCTL_ONE ((void *)&sysctl_vals[2]) +#define SYSCTL_TWO ((void *)&sysctl_vals[3]) +#define SYSCTL_FOUR ((void *)&sysctl_vals[4]) +#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5]) +#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6]) +#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[7]) +#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) +#define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) extern const int sysctl_vals[]; -- cgit v1.2.3 From bbe7a10ed83a5fa0b0ff6161ecdc4e65a0e9c993 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:00 -0800 Subject: hung_task: move hung_task sysctl interface to hung_task.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move hung_task sysctl interface to hung_task.c and use register_sysctl() to register the sysctl interface. 
[mcgrof@kernel.org: commit log refresh and fixed 2-3 0day reported compile issues] Link: https://lkml.kernel.org/r/20211123202347.818157-4-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Paul Turner Cc: Peter Zijlstra Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/sysctl.h | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 304f431178fd..c19dd5a2c05c 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,20 +7,8 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK - -#ifdef CONFIG_SMP -extern unsigned int sysctl_hung_task_all_cpu_backtrace; -#else -#define sysctl_hung_task_all_cpu_backtrace 0 -#endif /* CONFIG_SMP */ - -extern int sysctl_hung_task_check_count; -extern unsigned int sysctl_hung_task_panic; +/* used for hung_task and block/ */ extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_check_interval_secs; -extern int sysctl_hung_task_warnings; -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid 
need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; -- cgit v1.2.3 From 86b12b6c5d6b46e64bf2e8080528781032e4bd90 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:24 -0800 Subject: aio: move aio sysctl to aio.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Move aio sysctl to aio.c and use the new register_sysctl_init() to register the sysctl interface for aio. [mcgrof@kernel.org: adjust commit log to justify the move] Link: https://lkml.kernel.org/r/20211123202347.818157-9-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Jan Kara Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/aio.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/aio.h b/include/linux/aio.h index b83e68dd006f..86892a4fe7c8 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -20,8 +20,4 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ -/* for sysctl: */ -extern unsigned long aio_nr; -extern unsigned long aio_max_nr; - #endif /* __LINUX__AIO_H */ -- cgit v1.2.3 From 49a4de75719b6c0f1f375df9908a95cef1e34945 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:29 -0800 Subject: dnotify: move dnotify sysctl to dnotify.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move dnotify sysctls to dnotify.c and use the new register_sysctl_init() to register the sysctl interface. [mcgrof@kernel.org: adjust the commit log to justify the move] Link: https://lkml.kernel.org/r/20211123202347.818157-10-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Acked-by: Jan Kara Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. 
Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dnotify.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h index b87c3b85a166..b1d26f9f1c9f 100644 --- a/include/linux/dnotify.h +++ b/include/linux/dnotify.h @@ -29,7 +29,6 @@ struct dnotify_struct { FS_CREATE | FS_RENAME |\ FS_MOVED_FROM | FS_MOVED_TO) -extern int dir_notify_enable; extern void dnotify_flush(struct file *, fl_owner_t); extern int fcntl_dirnotify(int, struct file *, unsigned long); -- cgit v1.2.3 From 7b9ad122b52c9839e1f68f16c907990a6ad6f793 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:59 -0800 Subject: inotify: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. Move inotify_user sysctl to inotify_user.c while at it to remove clutter from kernel/sysctl.c. 
[mcgrof@kernel.org: remember to register fanotify_table] Link: https://lkml.kernel.org/r/YZ5A6iWLb0h3N3RC@bombadil.infradead.org [mcgrof@kernel.org: update commit log to reflect new path we decided to take] Link: https://lkml.kernel.org/r/20211123202422.819032-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fanotify.h | 2 -- include/linux/inotify.h | 3 --- 2 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 3afdf339d53c..419cadcd7ff5 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -5,8 +5,6 @@ #include #include -extern struct ctl_table fanotify_table[]; /* for sysctl */ - #define FAN_GROUP_FLAG(group, flag) \ ((group)->fanotify_data.flags & (flag)) diff --git a/include/linux/inotify.h b/include/linux/inotify.h index 6a24905f6e1e..8d20caa1b268 100644 --- a/include/linux/inotify.h +++ b/include/linux/inotify.h @@ -7,11 +7,8 @@ #ifndef _LINUX_INOTIFY_H #define _LINUX_INOTIFY_H -#include #include -extern struct ctl_table inotify_table[]; /* for sysctl */ - #define ALL_INOTIFY_BITS (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | \ IN_MOVED_TO | IN_CREATE | IN_DELETE | \ -- cgit v1.2.3 From a8f5de894f76f1c73f4a068d04897a5e2f873825 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:09 -0800 Subject: eventpoll: simplify sysctl declaration with register_sysctl() The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the epoll_table sysctl to fs/eventpoll.c and use register_sysctl(). 
Link: https://lkml.kernel.org/r/20211123202422.819032-9-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poll.h | 2 -- include/linux/sysctl.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/poll.h b/include/linux/poll.h index 1cdc32b1f1b0..a9e0e1c2d1f2 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -8,12 +8,10 @@ #include #include #include -#include #include #include #include -extern struct ctl_table epoll_table[]; /* for sysctl */ /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. 
*/ #ifdef __clang__ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 47cf70c8eb93..6dd0f277f844 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -219,7 +219,6 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; extern struct ctl_table random_table[]; extern struct ctl_table firmware_config_table[]; -extern struct ctl_table epoll_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) -- cgit v1.2.3 From 6aad36d421d8bfe156508fa4edfe67827234cf0f Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:13 -0800 Subject: firmware_loader: move firmware sysctl to its own files Patch series "sysctl: 3rd set of kernel/sysctl cleanups", v2. This is the third set of patches to help address cleaning the kitchen sink in kernel/sysctl.c and to move sysctls away to where they are actually implemented / used. This patch (of 8): kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the firmware configuration sysctl table to the only place where it is used, and make it clear that if sysctls are disabled this is not used. 
[akpm@linux-foundation.org: export register_firmware_config_sysctl and unregister_firmware_config_sysctl to modules] [akpm@linux-foundation.org: use EXPORT_SYMBOL_NS_GPL instead] [sfr@canb.auug.org.au: fix that so it compiles] Link: https://lkml.kernel.org/r/20211201160626.401d828d@canb.auug.org.au [mcgrof@kernel.org: major commit log update to justify the move] Link: https://lkml.kernel.org/r/20211124231435.1445213-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211124231435.1445213-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Signed-off-by: Stephen Rothwell Cc: Kees Cook Cc: Iurii Zaikin Cc: Eric Biederman Cc: Stephen Kitt Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: "Theodore Ts'o" Cc: Al Viro Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt (VMware) Cc: John Ogness Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Martin K. Petersen Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Suren Baghdasaryan Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 6dd0f277f844..3985e9c80155 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -218,7 +218,6 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; extern struct ctl_table random_table[]; -extern struct ctl_table firmware_config_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) -- cgit v1.2.3 From 
5475e8f03c80bbce7b43a57d861f5acc44a60b22 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:18 -0800 Subject: random: move the random sysctl declarations to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the random sysctls to their own file and use register_sysctl_init(). [mcgrof@kernel.org: commit log update to justify the move] Link: https://lkml.kernel.org/r/20211124231435.1445213-3-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. 
Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 3985e9c80155..fce05a060bc5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -217,7 +217,6 @@ extern int unaligned_dump_stack; extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; -extern struct ctl_table random_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) -- cgit v1.2.3 From ee9efac48a082904d17a20131aa73d82f058cdd6 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:23 -0800 Subject: sysctl: add helper to register a sysctl mount point The way to create a subdirectory on top of sysctl_mount_point is a bit obscure, and *why* we do that even so more. Provide a helper which makes it clear why we do this. [akpm@linux-foundation.org: export register_sysctl_mount_point() to modules] Link: https://lkml.kernel.org/r/20211124231435.1445213-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Suggested-by: "Eric W. Biederman" Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. 
Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index fce05a060bc5..746c098a6ff5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -209,6 +209,8 @@ extern int sysctl_init(void); extern void __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name); #define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) +extern struct ctl_table_header *register_sysctl_mount_point(const char *path); + void do_sysctl_args(void); extern int pwrsw_enabled; @@ -224,6 +226,11 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * return NULL; } +static inline struct sysctl_header *register_sysctl_mount_point(const char *path) +{ + return NULL; +} + static inline struct ctl_table_header *register_sysctl_paths( const struct ctl_path *path, struct ctl_table *table) { -- cgit v1.2.3 From 0df8bdd5e3b3e557ce2c2575fce0c64c5dd1045a Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:43 -0800 Subject: stackleak: move stack_erasing sysctl to stackleak.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the stack_erasing sysctl from kernel/sysctl.c to kernel/stackleak.c and use register_sysctl() to register the sysctl interface. 
[mcgrof@kernel.org: commit log update] Link: https://lkml.kernel.org/r/20211124231435.1445213-8-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackleak.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index a59db2f08e76..ccaab2043fcd 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -23,11 +23,6 @@ static inline void stackleak_task_init(struct task_struct *t) # endif } -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE -int stack_erasing_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -#endif - #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */ static inline void stackleak_task_init(struct task_struct *t) { } #endif -- cgit v1.2.3 From b1f2aff888af54a057c2c3c0d88a13ef5d37b52a Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:48 -0800 Subject: sysctl: share unsigned long const values Provide a way to share unsigned long values. This will allow others to not have to re-invent these values. 
Link: https://lkml.kernel.org/r/20211124231435.1445213-9-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 746c098a6ff5..2de6d20d191b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -51,6 +51,12 @@ struct ctl_dir; extern const int sysctl_vals[]; +#define SYSCTL_LONG_ZERO ((void *)&sysctl_long_vals[0]) +#define SYSCTL_LONG_ONE ((void *)&sysctl_long_vals[1]) +#define SYSCTL_LONG_MAX ((void *)&sysctl_long_vals[2]) + +extern const unsigned long sysctl_long_vals[]; + typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.3 From 1d67fe585049d3e2448b997af78c68cbf90ada09 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:52 -0800 Subject: fs: move inode sysctls to its own file Patch series "sysctl: 4th set of kernel/sysctl cleanups". 
This is slimming down the fs uses of kernel/sysctl.c to the point that the next step is to just get rid of the fs base directory for it and move that elsewhere, so that next patch series starts dealing with that to demo how we can end up cleaning up a full base directory from kernel/sysctl.c, one at a time. This patch (of 9): kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the inode sysctls to its own file. Since we are no longer using this outside of fs/ remove the extern declaration of its respective proc helper. We use early_initcall() as it is the earliest we can use. [arnd@arndb.de: avoid unused-variable warning] Link: https://lkml.kernel.org/r/20211203190123.874239-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Kees Cook Cc: Iurii Zaikin Cc: Xiaoming Ni Cc: Eric Biederman Cc: Stephen Kitt Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Andy Shevchenko Cc: Jeff Layton Cc: "J. 
Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c8510da6cc6d..044de67c8167 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -82,7 +82,6 @@ extern void __init files_maxfiles_init(void); extern struct files_stat_struct files_stat; extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; -extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; extern int sysctl_protected_symlinks; extern int sysctl_protected_hardlinks; @@ -3537,8 +3536,6 @@ int proc_nr_files(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int proc_nr_inodes(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) -- cgit v1.2.3 From 204d5a24e15562b2816825c0f9b49d26814b77be Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:56 -0800 Subject: fs: move fs stat sysctls to file_table.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. We can create the sysctl dynamically on early init for fs stat to help with this clutter. This dusts off the fs stat syctls knobs and puts them into where they are declared. 
Link: https://lkml.kernel.org/r/20211129205548.605569-3-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 044de67c8167..1e6761966120 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -79,7 +79,6 @@ extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); -extern struct files_stat_struct files_stat; extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; extern int leases_enable, lease_break_time; @@ -3532,8 +3531,6 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); -- cgit v1.2.3 From c8c0c239d5ab1e3e8d2bb0453ce642fe2c6357ec Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:59 -0800 Subject: fs: move dcache sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the dcache sysctl clutter out of kernel/sysctl.c. 
This is a small one-off entry, perhaps later we can simplify this representation, but for now we use the helpers we have. We won't know how we can simplify this further until we're fully done with the cleanup. [arnd@arndb.de: avoid unused-function warning] Link: https://lkml.kernel.org/r/20211203190123.874239-2-arnd@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dcache.h | 10 ---------- include/linux/fs.h | 2 -- 2 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 9e23d33bb6f1..f5bba51480b2 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -61,16 +61,6 @@ extern const struct qstr empty_name; extern const struct qstr slash_name; extern const struct qstr dotdot_name; -struct dentry_stat_t { - long nr_dentry; - long nr_unused; - long age_limit; /* age in seconds */ - long want_pages; /* pages requested by system */ - long nr_negative; /* # of unused negative dentries */ - long dummy; /* Reserved for future use */ -}; -extern struct dentry_stat_t dentry_stat; - /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the diff --git a/include/linux/fs.h b/include/linux/fs.h index 1e6761966120..9b856d5da9e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3531,8 +3531,6 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; -int proc_nr_dentry(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, 
size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) -- cgit v1.2.3 From 54771613e8a7dbbba2a205ddf1b33e25a290b3fd Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:03 -0800 Subject: sysctl: move maxolduid as a sysctl specific const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The maxolduid value is only shared for sysctl purposes for use on a max range. Just stuff this into our shared const array. [akpm@linux-foundation.org: fix sysctl_vals[], per Mickaël] Link: https://lkml.kernel.org/r/20211129205548.605569-5-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Mickaël Salaün Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 2de6d20d191b..bb921eb8a02d 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -49,6 +49,9 @@ struct ctl_dir; #define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) #define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) +/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +#define SYSCTL_MAXOLDUID ((void *)&sysctl_vals[10]) + extern const int sysctl_vals[]; #define SYSCTL_LONG_ZERO ((void *)&sysctl_long_vals[0]) -- cgit v1.2.3 From dd81faa88340a1fe8cd81c8ecbadd8e95c58549c Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:10 -0800 Subject: fs: move locking sysctls where they are used kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. 
The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. The locking fs sysctls are only used on fs/locks.c, so move them there. Link: https://lkml.kernel.org/r/20211129205548.605569-7-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9b856d5da9e2..0e08c3dd8f75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -82,10 +82,6 @@ extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; extern int leases_enable, lease_break_time; -extern int sysctl_protected_symlinks; -extern int sysctl_protected_hardlinks; -extern int sysctl_protected_fifos; -extern int sysctl_protected_regular; typedef __kernel_rwf_t rwf_t; -- cgit v1.2.3 From 9c011be132972ff94bde2ae99064e29f94e85c68 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:13 -0800 Subject: fs: move namei sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move namei's own sysctl knobs to its own file. 
Other than the move we also avoid initializing two static variables to 0 as this is not needed: * sysctl_protected_symlinks * sysctl_protected_hardlinks Link: https://lkml.kernel.org/r/20211129205548.605569-8-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0e08c3dd8f75..9617dea24978 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -81,7 +81,6 @@ extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; -extern int leases_enable, lease_break_time; typedef __kernel_rwf_t rwf_t; -- cgit v1.2.3 From 1998f19324d24df7de4e74d81503b4299eb99e7d Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:20 -0800 Subject: fs: move pipe sysctls to is own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the pipe sysctls to its own file. Link: https://lkml.kernel.org/r/20211129205548.605569-10-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. 
Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 4 ---- include/linux/sysctl.h | 6 ++++++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index fc5642431b92..c00c618ef290 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -238,10 +238,6 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); -extern unsigned int pipe_max_size; -extern unsigned long pipe_user_pages_hard; -extern unsigned long pipe_user_pages_soft; - /* Wait for a pipe to be readable/writable while dropping the pipe lock */ void pipe_wait_readable(struct pipe_inode_info *); void pipe_wait_writable(struct pipe_inode_info *); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index bb921eb8a02d..4294e9668bd5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -221,6 +221,12 @@ extern void __register_sysctl_init(const char *path, struct ctl_table *table, extern struct ctl_table_header *register_sysctl_mount_point(const char *path); void do_sysctl_args(void); +int do_proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data); extern int pwrsw_enabled; extern int unaligned_enabled; -- cgit v1.2.3 From 51cb8dfc5a5c39e6c70376b9dc9a14d624a9d271 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:24 -0800 Subject: sysctl: add and use base directory declarer and registration helper Patch series "sysctl: add and use base directory declarer and registration helper". In this patch series we start addressing base directories, and so we start with the "fs" sysctls. 
The end goal is we end up completely moving all "fs" sysctl knobs out from kernel/sysctl.c. This patch (of 6): Add a set of helpers which can be used to declare and register base directory sysctls on their own. We do this so we can later move each of the base sysctl directories like "fs", "kernel", etc, to their own respective files instead of shoving the declarations and registrations all on kernel/sysctl.c. The lazy approach has caught up and with this, we just end up extending the list of base directories / sysctls on one file and this makes maintenance difficult due to merge conflicts from many developers. The declarations are used first by kernel/sysctl.c for registering its own base which over time we'll try to clean up. It will be used in the next patch to demonstrate how to cleanly deal with base sysctl directories. [mcgrof@kernel.org: null-terminate the ctl_table arrays] Link: https://lkml.kernel.org/r/YafJY3rXDYnjK/gs@bombadil.infradead.org Link: https://lkml.kernel.org/r/20211129211943.640266-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211129211943.640266-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Kees Cook Cc: Iurii Zaikin Cc: Xiaoming Ni Cc: Eric Biederman Cc: Stephen Kitt Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Christian Brauner Cc: Eric Biggers Cc: "Naveen N. Rao" Cc: "David S. 
Miller" Cc: Masami Hiramatsu Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4294e9668bd5..02134a8abad7 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -194,6 +194,20 @@ struct ctl_path { #ifdef CONFIG_SYSCTL +#define DECLARE_SYSCTL_BASE(_name, _table) \ +static struct ctl_table _name##_base_table[] = { \ + { \ + .procname = #_name, \ + .mode = 0555, \ + .child = _table, \ + }, \ + { }, \ +} + +extern int __register_sysctl_base(struct ctl_table *base_table); + +#define register_sysctl_base(_name) __register_sysctl_base(_name##_base_table) + void proc_sys_poll_notify(struct ctl_table_poll *poll); extern void setup_sysctl_set(struct ctl_table_set *p, @@ -236,6 +250,16 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; #else /* CONFIG_SYSCTL */ + +#define DECLARE_SYSCTL_BASE(_name, _table) + +static inline int __register_sysctl_base(struct ctl_table *base_table) +{ + return 0; +} + +#define register_sysctl_base(table) __register_sysctl_base(table) + static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) { return NULL; -- cgit v1.2.3 From ab171b952c6e065779687b44041038efdadb3915 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:27 -0800 Subject: fs: move namespace sysctls and declare fs base directory This moves the namespace sysctls to its own file as part of the kernel/sysctl.c spring cleaning Since we have now removed all sysctls for "fs", we now have to declare it on the filesystem code, we do that using the new helper, which reduces boiler plate code. We rename init_fs_shared_sysctls() to init_fs_sysctls() to reflect that now fs/sysctls.c is taking on the burden of being the first to register the base directory as well. 
Lastly, since init code will load in the order in which we link it we have to move the sysctl code to be linked in early, so that its early init routine runs prior to other fs code. This way, other filesystem code can register their own sysctls using the helpers after this: * register_sysctl_init() * register_sysctl() Link: https://lkml.kernel.org/r/20211129211943.640266-3-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mount.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index 5d92a7e1a742..7f18a7555dff 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -113,9 +113,6 @@ extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); extern dev_t name_to_dev_t(const char *name); - -extern unsigned int sysctl_mount_max; - extern bool path_is_mountpoint(const struct path *path); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); -- cgit v1.2.3 From d8c0418aac78e661b5283c9d6a1dfc61d44f26fd Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:31 -0800 Subject: kernel/sysctl.c: rename sysctl_init() to sysctl_init_bases() Rename sysctl_init() to sysctl_init_bases() so to reflect exactly what this is doing. Link: https://lkml.kernel.org/r/20211129211943.640266-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. 
Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 02134a8abad7..180adf7da785 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -228,7 +228,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); -extern int sysctl_init(void); +extern int sysctl_init_bases(void); extern void __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name); #define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) -- cgit v1.2.3 From fdcd4073fccc6f989308be3f1d61d8a68cd990ce Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:34 -0800 Subject: printk: fix build warning when CONFIG_PRINTK=n build warning when CONFIG_PRINTK=n kernel/printk/printk.c:175:5: warning: no previous prototype for 'devkmsg_sysctl_set_loglvl' [-Wmissing-prototypes] devkmsg_sysctl_set_loglvl() is only used in sysctl.c when CONFIG_PRINTK=y, but it participates in the build when CONFIG_PRINTK=n. So add compile dependency CONFIG_PRINTK=y && CONFIG_SYSCTL=y to fix the build warning. Link: https://lkml.kernel.org/r/20211129211943.640266-5-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. 
Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 9497f6b98339..1522df223c0f 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -183,10 +183,6 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, extern int printk_delay_msec; extern int dmesg_restrict; -extern int -devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf, - size_t *lenp, loff_t *ppos); - extern void wake_up_klogd(void); char *log_buf_addr_get(void); -- cgit v1.2.3 From f0bc21b268c1464603192a00851cdbbf7c2cdc36 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:38 -0800 Subject: fs/coredump: move coredump sysctls into its own file This moves the fs/coredump.c respective sysctls to its own file. Link: https://lkml.kernel.org/r/20211129211943.640266-6-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 78fcd776b185..248a68c668b4 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -14,10 +14,6 @@ struct core_vma_metadata { unsigned long dump_size; }; -extern int core_uses_pid; -extern char core_pattern[]; -extern unsigned int core_pipe_limit; - /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. 
@@ -37,4 +33,10 @@ extern void do_coredump(const kernel_siginfo_t *siginfo); static inline void do_coredump(const kernel_siginfo_t *siginfo) {} #endif +#if defined(CONFIG_COREDUMP) && defined(CONFIG_SYSCTL) +extern void validate_coredump_safety(void); +#else +static inline void validate_coredump_safety(void) {} +#endif + #endif /* _LINUX_COREDUMP_H */ -- cgit v1.2.3 From a737a3c6744bc822d1e6a837fef550e665ddf877 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:41 -0800 Subject: kprobe: move sysctl_kprobes_optimization to kprobes.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Move sysctl_kprobes_optimization from kernel/sysctl.c to kernel/kprobes.c. Use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: fix compile issue when CONFIG_OPTPROBES is disabled] Link: https://lkml.kernel.org/r/20211129211943.640266-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. 
Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8c8f7a4d93af..19b884353b15 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -348,12 +348,6 @@ extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); DEFINE_INSN_CACHE_OPS(optinsn); -#ifdef CONFIG_SYSCTL -extern int sysctl_kprobes_optimization; -extern int proc_kprobes_optimization_handler(struct ctl_table *table, - int write, void *buffer, - size_t *length, loff_t *ppos); -#endif /* CONFIG_SYSCTL */ extern void wait_for_kprobe_optimizer(void); #else /* !CONFIG_OPTPROBES */ static inline void wait_for_kprobe_optimizer(void) { } -- cgit v1.2.3 From 4a57d6bbaecd28c8175dc5da013009e4158018c2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:10 -0800 Subject: locking/rwlocks: introduce write_lock_nested In preparation for converting bit_spin_lock to rwlock in zsmalloc so that multiple writers of zspages can run at the same time but those zspages are supposed to be different zspage instance. Thus, it's not deadlock. This patch adds write_lock_nested to support the case for LOCKDEP. 
[minchan@kernel.org: fix write_lock_nested for RT] Link: https://lkml.kernel.org/r/YZfrMTAXV56HFWJY@google.com [bigeasy@linutronix.de: fixup write_lock_nested() implementation] Link: https://lkml.kernel.org/r/20211123170134.y6xb7pmpgdn4m3bn@linutronix.de Link: https://lkml.kernel.org/r/20211115185909.3949505-8-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Naresh Kamboju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rwlock.h | 6 ++++++ include/linux/rwlock_api_smp.h | 8 ++++++++ include/linux/rwlock_rt.h | 10 ++++++++++ include/linux/spinlock_api_up.h | 1 + 4 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 2c0ad417ce3c..8f416c5e929e 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -55,6 +55,12 @@ do { \ #define write_lock(lock) _raw_write_lock(lock) #define read_lock(lock) _raw_read_lock(lock) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define write_lock_nested(lock, subclass) _raw_write_lock_nested(lock, subclass) +#else +#define write_lock_nested(lock, subclass) _raw_write_lock(lock) +#endif + #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define read_lock_irqsave(lock, flags) \ diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index f1db6f17c4fb..dceb0a59b692 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -17,6 +17,7 @@ void __lockfunc _raw_read_lock(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_write_lock(rwlock_t *lock) __acquires(lock); +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) __acquires(lock); void __lockfunc _raw_read_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_write_lock_bh(rwlock_t *lock) 
__acquires(lock); void __lockfunc _raw_read_lock_irq(rwlock_t *lock) __acquires(lock); @@ -209,6 +210,13 @@ static inline void __raw_write_lock(rwlock_t *lock) LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); } +static inline void __raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + preempt_disable(); + rwlock_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); +} + #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */ static inline void __raw_write_unlock(rwlock_t *lock) diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h index 49c1f3842ed5..8544ff05e594 100644 --- a/include/linux/rwlock_rt.h +++ b/include/linux/rwlock_rt.h @@ -28,6 +28,7 @@ extern void rt_read_lock(rwlock_t *rwlock); extern int rt_read_trylock(rwlock_t *rwlock); extern void rt_read_unlock(rwlock_t *rwlock); extern void rt_write_lock(rwlock_t *rwlock); +extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass); extern int rt_write_trylock(rwlock_t *rwlock); extern void rt_write_unlock(rwlock_t *rwlock); @@ -83,6 +84,15 @@ static __always_inline void write_lock(rwlock_t *rwlock) rt_write_lock(rwlock); } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static __always_inline void write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rt_write_lock_nested(rwlock, subclass); +} +#else +#define write_lock_nested(lock, subclass) rt_write_lock(((void)(subclass), (lock))) +#endif + static __always_inline void write_lock_bh(rwlock_t *rwlock) { local_bh_disable(); diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index d0d188861ad6..b8ba00ccccde 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -59,6 +59,7 @@ #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) #define _raw_read_lock(lock) __LOCK(lock) #define _raw_write_lock(lock) __LOCK(lock) +#define _raw_write_lock_nested(lock, subclass) __LOCK(lock) #define _raw_spin_lock_bh(lock) 
__LOCK_BH(lock) #define _raw_read_lock_bh(lock) __LOCK_BH(lock) #define _raw_write_lock_bh(lock) __LOCK_BH(lock) -- cgit v1.2.3 From 6dfbbae14a7b961f41d80a106e1ab60e86d061c5 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 21 Jan 2022 22:14:20 -0800 Subject: fs: proc: store PDE()->data into inode->i_private PDE_DATA(inode) is introduced to get user private data and hide the layout of struct proc_dir_entry. The inode->i_private is used to do the same thing as well. Save a copy of user private data to inode->i_private when proc inode is allocated. This means the user also can get their private data by inode->i_private. Introduce pde_data() to wrap inode->i_private so that we can remove PDE_DATA() from fs/proc/generic.c and make PDE_DATA() a wrapper of pde_data(). It will be easier if we decide to remove PDE_DATA() in the future. Link: https://lkml.kernel.org/r/20211124081956.87711-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Christian Brauner Cc: Alexey Dobriyan Cc: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/proc_fs.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 01b9268451a8..b6e7005cc1b2 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -110,7 +110,18 @@ extern struct proc_dir_entry *proc_create_data(const char *, umode_t, struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops); extern void proc_set_size(struct proc_dir_entry *, loff_t); extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t); -extern void *PDE_DATA(const struct inode *); + +/* + * Obtain the private data passed by user through proc_create_data() or + * related. 
+ */ +static inline void *pde_data(const struct inode *inode) +{ + return inode->i_private; +} + +#define PDE_DATA(i) pde_data(i) + extern void *proc_get_parent_data(const struct inode *); extern void proc_remove(struct proc_dir_entry *); extern void remove_proc_entry(const char *, struct proc_dir_entry *); -- cgit v1.2.3 From 359745d78351c6f5442435f81549f0207ece28aa Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 21 Jan 2022 22:14:23 -0800 Subject: proc: remove PDE_DATA() completely Remove PDE_DATA() completely and replace it with pde_data(). [akpm@linux-foundation.org: fix naming clash in drivers/nubus/proc.c] [akpm@linux-foundation.org: now fix it properly] Link: https://lkml.kernel.org/r/20211124081956.87711-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Christian Brauner Cc: Alexey Dobriyan Cc: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/proc_fs.h | 4 +--- include/linux/seq_file.h | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index b6e7005cc1b2..81d6e4ec2294 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -120,8 +120,6 @@ static inline void *pde_data(const struct inode *inode) return inode->i_private; } -#define PDE_DATA(i) pde_data(i) - extern void *proc_get_parent_data(const struct inode *); extern void proc_remove(struct proc_dir_entry *); extern void remove_proc_entry(const char *, struct proc_dir_entry *); @@ -202,7 +200,7 @@ proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {} static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {} -static inline void *PDE_DATA(const struct inode *inode) {BUG(); return NULL;} +static inline void *pde_data(const struct inode *inode) {BUG(); return NULL;} static inline void 
*proc_get_parent_data(const struct inode *inode) { BUG(); return NULL; } static inline void proc_remove(struct proc_dir_entry *de) {} diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 72dbb44a4573..88cc16444b43 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -209,7 +209,7 @@ static const struct file_operations __name ## _fops = { \ #define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ - return single_open(file, __name ## _show, PDE_DATA(inode)); \ + return single_open(file, __name ## _show, pde_data(inode)); \ } \ \ static const struct proc_ops __name ## _proc_ops = { \ -- cgit v1.2.3 From 2dba5eb1c73b6ba2988ced07250edeac0f8cbf5a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 21 Jan 2022 22:14:27 -0800 Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() Currently, enabling CONFIG_STACKDEPOT means its stack_table will be allocated from memblock, even if stack depot ends up not actually used. The default size of stack_table is 4MB on 32-bit, 8MB on 64-bit. This is fine for use-cases such as KASAN which is also a config option and has overhead on its own. But it's an issue for functionality that has to be actually enabled on boot (page_owner) or depends on hardware (GPU drivers) and thus the memory might be wasted. This was raised as an issue [1] when attempting to add stackdepot support for SLUB's debug object tracking functionality. It's common to build kernels with CONFIG_SLUB_DEBUG and enable slub_debug on boot only when needed, or create only specific kmem caches with debugging for testing purposes. It would thus be more efficient if stackdepot's table was allocated only when actually going to be used. This patch thus makes the allocation (and whole stack_depot_init() call) optional: - Add a CONFIG_STACKDEPOT_ALWAYS_INIT flag to keep using the current well-defined point of allocation as part of mem_init(). 
Make CONFIG_KASAN select this flag. - Other users have to call stack_depot_init() as part of their own init when it's determined that stack depot will actually be used. This may depend on both config and runtime conditions. Convert current users which are page_owner and several in the DRM subsystem. Same will be done for SLUB later. - Because the init might now be called after the boot-time memblock allocation has given all memory to the buddy allocator, change stack_depot_init() to allocate stack_table with kvmalloc() when memblock is no longer available. Also handle allocation failure by disabling stackdepot (could have theoretically happened even with memblock allocation previously), and don't unnecessarily align the memblock allocation to its own size anymore. [1] https://lore.kernel.org/all/CAMuHMdW=eoVzM1Re5FVoEN87nKfiLmM2+Ah7eNu2KXEhCvbZyA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211013073005.11351-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Dmitry Vyukov Reviewed-by: Marco Elver # stackdepot Cc: Marco Elver Cc: Vijayanand Jitta Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Geert Uytterhoeven Cc: Oliver Glitta Cc: Imran Khan From: Colin Ian King Subject: lib/stackdepot: fix spelling mistake and grammar in pr_err message There is a spelling mistake of the word allocation so fix this and re-phrase the message to make it easier to read. Link: https://lkml.kernel.org/r/20211015104159.11282-1-colin.king@canonical.com Signed-off-by: Colin Ian King Cc: Vlastimil Babka From: Vlastimil Babka Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup On FLATMEM, we call page_ext_init_flatmem_late() just before kmem_cache_init() which means stack_depot_init() (called by page owner init) will not recognize properly it should use kvmalloc() and not memblock_alloc(). 
memblock_alloc() will also not issue a warning and return a block memory that can be invalid and cause kernel page fault when saving stacks, as reported by the kernel test robot [1]. Fix this by moving page_ext_init_flatmem_late() below kmem_cache_init() so that slab_is_available() is true during stack_depot_init(). SPARSEMEM doesn't have this issue, as it doesn't do page_ext_init_flatmem_late(), but a different page_ext_init() even later in the boot process. Thanks to Mike Rapoport for pointing out the FLATMEM init ordering issue. While at it, also actually resolve a checkpatch warning in stack_depot_init() from DRM CI, which was supposed to be in the original patch already. [1] https://lore.kernel.org/all/20211014085450.GC18719@xsang-OptiPlex-9020/ Link: https://lkml.kernel.org/r/6abd9213-19a9-6d58-cedc-2414386d2d81@suse.cz Signed-off-by: Vlastimil Babka Reported-by: kernel test robot Cc: Mike Rapoport Cc: Stephen Rothwell From: Vlastimil Babka Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup3 Due to cd06ab2fd48f ("drm/locking: add backtrace for locking contended locks without backoff") landing recently to -next adding a new stack depot user in drivers/gpu/drm/drm_modeset_lock.c we need to add an appropriate call to stack_depot_init() there as well. 
Link: https://lkml.kernel.org/r/2a692365-cfa1-64f2-34e0-8aa5674dce5e@suse.cz Signed-off-by: Vlastimil Babka Cc: Jani Nikula Cc: Naresh Kamboju Cc: Marco Elver Cc: Vijayanand Jitta Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Geert Uytterhoeven Cc: Oliver Glitta Cc: Imran Khan Cc: Stephen Rothwell From: Vlastimil Babka Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup4 Due to 4e66934eaadc ("lib: add reference counting tracking infrastructure") landing recently to net-next adding a new stack depot user in lib/ref_tracker.c we need to add an appropriate call to stack_depot_init() there as well. Link: https://lkml.kernel.org/r/45c1b738-1a2f-5b5f-2f6d-86fab206d01c@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Eric Dumazet Cc: Jiri Slab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ref_tracker.h | 2 ++ include/linux/stackdepot.h | 25 ++++++++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h index c11c9db5825c..60f3453be23e 100644 --- a/include/linux/ref_tracker.h +++ b/include/linux/ref_tracker.h @@ -4,6 +4,7 @@ #include #include #include +#include struct ref_tracker; @@ -26,6 +27,7 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, spin_lock_init(&dir->lock); dir->quarantine_avail = quarantine_count; refcount_set(&dir->untracked, 1); + stack_depot_init(); } void ref_tracker_dir_exit(struct ref_tracker_dir *dir); diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index c34b55a6e554..17f992fe6355 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -19,6 +19,22 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags, bool 
can_alloc); +/* + * Every user of stack depot has to call this during its own init when it's + * decided that it will be calling stack_depot_save() later. + * + * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot + * enabled as part of mm_init(), for subsystems where it's known at compile time + * that stack depot will be used. + */ +int stack_depot_init(void); + +#ifdef CONFIG_STACKDEPOT_ALWAYS_INIT +static inline int stack_depot_early_init(void) { return stack_depot_init(); } +#else +static inline int stack_depot_early_init(void) { return 0; } +#endif + depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); @@ -30,13 +46,4 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, void stack_depot_print(depot_stack_handle_t stack); -#ifdef CONFIG_STACKDEPOT -int stack_depot_init(void); -#else -static inline int stack_depot_init(void) -{ - return 0; -} -#endif /* CONFIG_STACKDEPOT */ - #endif -- cgit v1.2.3 From 0a4ee518185e902758191d968600399f3bc2be31 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:34 -0800 Subject: mm: remove cleancache Patch series "remove Xen tmem leftovers". Since the removal of the Xen tmem driver in 2019, the cleancache hooks are entirely unused, as are large parts of frontswap. This series against linux-next (with the folio changes included) removes cleancaches, and cuts down frontswap to the bits actually used by zswap. This patch (of 13): The cleancache subsystem is unused since the removal of Xen tmem driver in commit 814bbf49dcd0 ("xen: remove tmem driver"). 
[akpm@linux-foundation.org: remove now-unreachable code] Link: https://lkml.kernel.org/r/20211224062246.1258487-1-hch@lst.de Link: https://lkml.kernel.org/r/20211224062246.1258487-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Acked-by: Geert Uytterhoeven Cc: Konrad Rzeszutek Wilk Cc: Hugh Dickins Cc: Seth Jennings Cc: Dan Streetman Cc: Vitaly Wool Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cleancache.h | 124 --------------------------------------------- include/linux/fs.h | 5 -- 2 files changed, 129 deletions(-) delete mode 100644 include/linux/cleancache.h (limited to 'include/linux') diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h deleted file mode 100644 index 5f5730c1d324..000000000000 --- a/include/linux/cleancache.h +++ /dev/null @@ -1,124 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CLEANCACHE_H -#define _LINUX_CLEANCACHE_H - -#include -#include -#include - -#define CLEANCACHE_NO_POOL -1 -#define CLEANCACHE_NO_BACKEND -2 -#define CLEANCACHE_NO_BACKEND_SHARED -3 - -#define CLEANCACHE_KEY_MAX 6 - -/* - * cleancache requires every file with a page in cleancache to have a - * unique key unless/until the file is removed/truncated. 
For some - * filesystems, the inode number is unique, but for "modern" filesystems - * an exportable filehandle is required (see exportfs.h) - */ -struct cleancache_filekey { - union { - ino_t ino; - __u32 fh[CLEANCACHE_KEY_MAX]; - u32 key[CLEANCACHE_KEY_MAX]; - } u; -}; - -struct cleancache_ops { - int (*init_fs)(size_t); - int (*init_shared_fs)(uuid_t *uuid, size_t); - int (*get_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*put_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t); - void (*invalidate_inode)(int, struct cleancache_filekey); - void (*invalidate_fs)(int); -}; - -extern int cleancache_register_ops(const struct cleancache_ops *ops); -extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(struct super_block *); -extern int __cleancache_get_page(struct page *); -extern void __cleancache_put_page(struct page *); -extern void __cleancache_invalidate_page(struct address_space *, struct page *); -extern void __cleancache_invalidate_inode(struct address_space *); -extern void __cleancache_invalidate_fs(struct super_block *); - -#ifdef CONFIG_CLEANCACHE -#define cleancache_enabled (1) -static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) -{ - return mapping->host->i_sb->cleancache_poolid >= 0; -} -static inline bool cleancache_fs_enabled(struct page *page) -{ - return cleancache_fs_enabled_mapping(page->mapping); -} -#else -#define cleancache_enabled (0) -#define cleancache_fs_enabled(_page) (0) -#define cleancache_fs_enabled_mapping(_page) (0) -#endif - -/* - * The shim layer provided by these inline functions allows the compiler - * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE - * is disabled, to a single global variable check if CONFIG_CLEANCACHE - * is enabled but no cleancache "backend" has dynamically enabled it, - * and, for the most frequent cleancache ops, 
to a single global variable - * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled - * and a cleancache backend has dynamically enabled cleancache, but the - * filesystem referenced by that cleancache op has not enabled cleancache. - * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially - * no measurable performance impact. - */ - -static inline void cleancache_init_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_fs(sb); -} - -static inline void cleancache_init_shared_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_shared_fs(sb); -} - -static inline int cleancache_get_page(struct page *page) -{ - if (cleancache_enabled && cleancache_fs_enabled(page)) - return __cleancache_get_page(page); - return -1; -} - -static inline void cleancache_put_page(struct page *page) -{ - if (cleancache_enabled && cleancache_fs_enabled(page)) - __cleancache_put_page(page); -} - -static inline void cleancache_invalidate_page(struct address_space *mapping, - struct page *page) -{ - /* careful... 
page->mapping is NULL sometimes when this is called */ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_invalidate_page(mapping, page); -} - -static inline void cleancache_invalidate_inode(struct address_space *mapping) -{ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_invalidate_inode(mapping); -} - -static inline void cleancache_invalidate_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_invalidate_fs(sb); -} - -#endif /* _LINUX_CLEANCACHE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 9617dea24978..f3daaea16554 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1535,11 +1535,6 @@ struct super_block { const struct dentry_operations *s_d_op; /* default d_op for dentries */ - /* - * Saved pool identifier for cleancache (-1 means none) - */ - int cleancache_poolid; - struct shrinker s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ -- cgit v1.2.3 From 3d6035f136009f9cae380022754cba31f32570c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:38 -0800 Subject: frontswap: remove frontswap_writethrough frontswap_writethrough is never called, so remove it. 
Link: https://lkml.kernel.org/r/20211224062246.1258487-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index b07d88c92bb2..4a03fda41572 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -26,7 +26,6 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); -extern void frontswap_writethrough(bool); #define FRONTSWAP_HAS_EXCLUSIVE_GETS extern void frontswap_tmem_exclusive_gets(bool); -- cgit v1.2.3 From 71024cb4a0bfe7767aec7a128d0a1a13a37b7fcd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:41 -0800 Subject: frontswap: remove frontswap_tmem_exclusive_gets frontswap_tmem_exclusive_gets is never called, so remove it. 
Link: https://lkml.kernel.org/r/20211224062246.1258487-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 4a03fda41572..83a56392cc7f 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -26,8 +26,6 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); -#define FRONTSWAP_HAS_EXCLUSIVE_GETS -extern void frontswap_tmem_exclusive_gets(bool); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); extern void __frontswap_init(unsigned type, unsigned long *map); -- cgit v1.2.3 From 0b364446d734da76e421dbfb09e5268270cefaf0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:44 -0800 Subject: frontswap: remove frontswap_shrink frontswap_shrink is never called, so remove it. 
Link: https://lkml.kernel.org/r/20211224062246.1258487-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 83a56392cc7f..d268d7bb6513 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -24,7 +24,6 @@ struct frontswap_ops { }; extern void frontswap_register_ops(struct frontswap_ops *ops); -extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); -- cgit v1.2.3 From 3e8e1af63d7a831f576477c25d9b89049bd2d53d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:47 -0800 Subject: frontswap: remove frontswap_curr_pages frontswap_curr_pages is never called, so remove it. 
Link: https://lkml.kernel.org/r/20211224062246.1258487-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index d268d7bb6513..5205c2977b20 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -24,7 +24,6 @@ struct frontswap_ops { }; extern void frontswap_register_ops(struct frontswap_ops *ops); -extern unsigned long frontswap_curr_pages(void); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); extern void __frontswap_init(unsigned type, unsigned long *map); -- cgit v1.2.3 From 1cf53c894d15dd4b73397a56fa055d76d3db66b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:51 -0800 Subject: frontswap: simplify frontswap_init Just use IS_ENABLED() and remove the __frontswap_init indirection. Also remove the unused export. 
Link: https://lkml.kernel.org/r/20211224062246.1258487-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 5205c2977b20..73d7beb44f2b 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -26,7 +26,7 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); -extern void __frontswap_init(unsigned type, unsigned long *map); +extern void frontswap_init(unsigned type, unsigned long *map); extern int __frontswap_store(struct page *page); extern int __frontswap_load(struct page *page); extern void __frontswap_invalidate_page(unsigned, pgoff_t); @@ -107,11 +107,4 @@ static inline void frontswap_invalidate_area(unsigned type) __frontswap_invalidate_area(type); } -static inline void frontswap_init(unsigned type, unsigned long *map) -{ -#ifdef CONFIG_FRONTSWAP - __frontswap_init(type, map); -#endif -} - #endif /* _LINUX_FRONTSWAP_H */ -- cgit v1.2.3 From 10a9c496789fe2098bfc018650fc77b23ba08a54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:57 -0800 Subject: mm: simplify try_to_unuse Remove the unused frontswap and pages_to_unuse arguments, and mark the function static now that the caller in frontswap is gone. 
[akpm@linux-foundation.org: fix shmem_unuse() stub, per Matthew] Link: https://lkml.kernel.org/r/20211224062246.1258487-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Cc: Naresh Kamboju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 7 ------- include/linux/shmem_fs.h | 3 +-- include/linux/swapfile.h | 1 - 3 files changed, 1 insertion(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 73d7beb44f2b..a9817d4fa74c 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -7,13 +7,6 @@ #include #include -/* - * Return code to denote that requested number of - * frontswap pages are unused(moved to page cache). - * Used in shmem_unuse and try_to_unuse. - */ -#define FRONTSWAP_PAGES_UNUSED 2 - struct frontswap_ops { void (*init)(unsigned); /* this swap type was just swapon'ed */ int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 166158b6e917..e65b80ed09e7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -83,8 +83,7 @@ extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); -extern int shmem_unuse(unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse); +int shmem_unuse(unsigned int type); extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, pgoff_t index); diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index e06febf62978..809cd01ef2c5 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -9,7 +9,6 
@@ extern spinlock_t swap_lock; extern struct plist_head swap_active_head; extern struct swap_info_struct *swap_info[]; -extern int try_to_unuse(unsigned int, bool, unsigned long); extern unsigned long generic_max_swapfile_size(void); extern unsigned long max_swapfile_size(void); -- cgit v1.2.3 From bd9cd521496ba8d537d8f46f4167bf4221aba9a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:01 -0800 Subject: frontswap: remove frontswap_test frontswap_test is unused now, remove it. Link: https://lkml.kernel.org/r/20211224062246.1258487-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index a9817d4fa74c..c5b2848d2240 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -18,7 +18,6 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); -extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); extern void frontswap_init(unsigned type, unsigned long *map); extern int __frontswap_store(struct page *page); extern int __frontswap_load(struct page *page); @@ -33,11 +32,6 @@ static inline bool frontswap_enabled(void) return static_branch_unlikely(&frontswap_enabled_key); } -static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) -{ - return __frontswap_test(sis, offset); -} - static inline void frontswap_map_set(struct swap_info_struct *p, unsigned long *map) { @@ -56,11 +50,6 @@ static inline bool frontswap_enabled(void) return false; } -static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) -{ - return false; -} - static inline void 
frontswap_map_set(struct swap_info_struct *p, unsigned long *map) { -- cgit v1.2.3 From 633423a09cb5cfe61438283e1ce49c23cf4a0611 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:07 -0800 Subject: mm: mark swap_lock and swap_active_head static swap_lock and swap_active_head are only used in swapfile.c, so mark them static. Link: https://lkml.kernel.org/r/20211224062246.1258487-12-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swapfile.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index 809cd01ef2c5..54078542134c 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -6,8 +6,6 @@ * these were static in swapfile.c but frontswap.c needs them and we don't * want to expose them to the dozens of source files that include swap.h */ -extern spinlock_t swap_lock; -extern struct plist_head swap_active_head; extern struct swap_info_struct *swap_info[]; extern unsigned long generic_max_swapfile_size(void); extern unsigned long max_swapfile_size(void); -- cgit v1.2.3 From 1da0d94a3ec8c5f3793b7be8538b55e60ebeefe3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:10 -0800 Subject: frontswap: remove support for multiple ops There is only a single instance of frontswap ops in the kernel, so simplify the frontswap code by removing support for multiple operations. 
Link: https://lkml.kernel.org/r/20211224062246.1258487-13-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index c5b2848d2240..a631bac12220 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -13,10 +13,9 @@ struct frontswap_ops { int (*load)(unsigned, pgoff_t, struct page *); /* load a page */ void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */ void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */ - struct frontswap_ops *next; /* private pointer to next ops */ }; -extern void frontswap_register_ops(struct frontswap_ops *ops); +int frontswap_register_ops(const struct frontswap_ops *ops); extern void frontswap_init(unsigned type, unsigned long *map); extern int __frontswap_store(struct page *page); -- cgit v1.2.3 From 25d490eb46486e88c16e64d9eb7cfd33a642d596 Mon Sep 17 00:00:00 2001 From: Wang Kefeng Date: Sat, 18 Dec 2021 09:30:40 +0100 Subject: ARM: 9172/1: amba: Cleanup amba pclk operation There is no user about amba_pclk_[un]prepare() besides pl330.c, directly use clk_[un]prepare(). After this, all the function about amba pclk operation, enable, disable, [un]prepare could be killed. 
Acked-by: Vinod Koul Signed-off-by: Kefeng Wang Signed-off-by: Russell King (Oracle) --- include/linux/amba/bus.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 6c7f47846971..09174970b855 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -121,26 +121,6 @@ struct amba_device *amba_find_device(const char *, struct device *, unsigned int int amba_request_regions(struct amba_device *, const char *); void amba_release_regions(struct amba_device *); -static inline int amba_pclk_enable(struct amba_device *dev) -{ - return clk_enable(dev->pclk); -} - -static inline void amba_pclk_disable(struct amba_device *dev) -{ - clk_disable(dev->pclk); -} - -static inline int amba_pclk_prepare(struct amba_device *dev) -{ - return clk_prepare(dev->pclk); -} - -static inline void amba_pclk_unprepare(struct amba_device *dev) -{ - clk_unprepare(dev->pclk); -} - /* Some drivers don't use the struct amba_device */ #define AMBA_CONFIG_BITS(a) (((a) >> 24) & 0xff) #define AMBA_REV_BITS(a) (((a) >> 20) & 0x0f) -- cgit v1.2.3 From dacf3ca134d0dc105caee77651a349a86bd77456 Mon Sep 17 00:00:00 2001 From: Wang Kefeng Date: Sat, 18 Dec 2021 09:30:41 +0100 Subject: ARM: 9173/1: amba: kill amba_find_match() There is no one use amba_find_match(), kill it. 
Signed-off-by: Kefeng Wang Signed-off-by: Russell King (Oracle) --- include/linux/amba/bus.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 09174970b855..6562f543c3e0 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -117,7 +117,6 @@ void amba_device_put(struct amba_device *); int amba_device_add(struct amba_device *, struct resource *); int amba_device_register(struct amba_device *, struct resource *); void amba_device_unregister(struct amba_device *); -struct amba_device *amba_find_device(const char *, struct device *, unsigned int, unsigned int); int amba_request_regions(struct amba_device *, const char *); void amba_release_regions(struct amba_device *); -- cgit v1.2.3