From 3f1b623a1be92103386bcab818e25885d6be9419 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Oct 2020 17:00:41 +0800 Subject: vdpa: introduce config op to get valid iova range This patch introduces a config op to get the valid iova range from the vDPA device. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20201023090043.14430-2-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index eae0bfd87d91..30bc7a7223bb 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -52,6 +52,16 @@ struct vdpa_device { int nvqs; }; +/** + * vDPA IOVA range - the IOVA range support by the device + * @first: start of the IOVA range + * @last: end of the IOVA range + */ +struct vdpa_iova_range { + u64 first; + u64 last; +}; + /** * vDPA_config_ops - operations for configuring a vDPA device. * Note: vDPA device drivers are required to implement all of the @@ -151,6 +161,10 @@ struct vdpa_device { * @get_generation: Get device config generation (optional) * @vdev: vdpa device * Returns u32: device generation + * @get_iova_range: Get supported iova range (optional) + * @vdev: vdpa device + * Returns the iova range supported by + * the device.
* @set_map: Set device memory mapping (optional) * Needed for device that using device * specific DMA translation (on-chip IOMMU) @@ -216,6 +230,7 @@ struct vdpa_config_ops { void (*set_config)(struct vdpa_device *vdev, unsigned int offset, const void *buf, unsigned int len); u32 (*get_generation)(struct vdpa_device *vdev); + struct vdpa_iova_range (*get_iova_range)(struct vdpa_device *vdev); /* DMA ops */ int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb); -- cgit v1.2.3 From cb47755725da7b90fecbb2aa82ac3b24a7adb89b Mon Sep 17 00:00:00 2001 From: Zeng Tao Date: Tue, 1 Sep 2020 17:30:13 +0800 Subject: time: Prevent undefined behaviour in timespec64_to_ns() UBSAN reports: Undefined behaviour in ./include/linux/time64.h:127:27 signed integer overflow: 17179869187 * 1000000000 cannot be represented in type 'long long int' Call Trace: timespec64_to_ns include/linux/time64.h:127 [inline] set_cpu_itimer+0x65c/0x880 kernel/time/itimer.c:180 do_setitimer+0x8e/0x740 kernel/time/itimer.c:245 __x64_sys_setitimer+0x14c/0x2c0 kernel/time/itimer.c:336 do_syscall_64+0xa1/0x540 arch/x86/entry/common.c:295 Commit bd40a175769d ("y2038: itimer: change implementation to timespec64") replaced the original conversion which handled time clamping correctly with timespec64_to_ns() which has no overflow protection. Fix it in timespec64_to_ns() as this is not necessarily limited to the usage in itimers. 
[ tglx: Added comment and adjusted the fixes tag ] Fixes: 361a3bf00582 ("time64: Add time64.h header and define struct timespec64") Signed-off-by: Zeng Tao Signed-off-by: Thomas Gleixner Reviewed-by: Arnd Bergmann Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1598952616-6416-1-git-send-email-prime.zeng@hisilicon.com --- include/linux/time64.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index c9dcb3e5781f..5117cb5b5656 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -124,6 +124,10 @@ static inline bool timespec64_valid_settod(const struct timespec64 *ts) */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { + /* Prevent multiplication overflow */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + return KTIME_MAX; + return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } -- cgit v1.2.3 From fbdd0049d98d44914fc57d4b91f867f4996c787b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 26 Oct 2020 15:43:59 +0200 Subject: RDMA/mlx5: Fix devlink deadlock on net namespace deletion When a mlx5 core devlink instance is reloaded in a different net namespace, its associated IB device is deleted and recreated. Example sequence is: $ ip netns add foo $ devlink dev reload pci/0000:00:08.0 netns foo $ ip netns del foo mlx5 IB device needs to attach and detach the netdevice to it through the netdev notifier chain during load and unload sequence. Below is the call graph of the unload flow. cleanup_net() down_read(&pernet_ops_rwsem); <- first sem acquired ops_pre_exit_list() pre_exit() devlink_pernet_pre_exit() devlink_reload() mlx5_devlink_reload_down() mlx5_unload_one() [...] mlx5_ib_remove() mlx5_ib_unbind_slave_port() mlx5_remove_netdev_notifier() unregister_netdevice_notifier() down_write(&pernet_ops_rwsem);<- recursive lock Hence, when the net namespace is deleted, mlx5 reload results in deadlock.
When the deadlock occurs, the devlink mutex is also held. This deadlocks not only the mlx5 device under reload, but also all the processes which attempt to access unrelated devlink devices. Hence, fix this by having the mlx5 ib driver register for the per-net netdev notifier instead of the global one, which operates on the net namespace without holding the pernet_ops_rwsem. Fixes: 4383cfcc65e7 ("net/mlx5: Add devlink reload") Link: https://lore.kernel.org/r/20201026134359.23150-1-parav@nvidia.com Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/driver.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index add85094f9a5..0f23e1ed5e71 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1213,4 +1213,22 @@ static inline bool mlx5_is_roce_enabled(struct mlx5_core_dev *dev) return val.vbool; } +/** + * mlx5_core_net - Provide net namespace of the mlx5_core_dev + * @dev: mlx5 core device + * + * mlx5_core_net() returns the net namespace of mlx5 core device. + * This can be called only in below described limited context. + * (a) When a devlink instance for mlx5_core is registered and + * when devlink reload operation is disabled. + * or + * (b) during devlink reload reload_down() and reload_up callbacks + * where it is ensured that devlink instance's net namespace is + * stable.
+ */ +static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) +{ + return devlink_net(priv_to_devlink(dev)); +} + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 343a3e8bc635bd4c58d45a4fe67f9c3a78fbd191 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:20:50 +0100 Subject: bpf: Fix -Wshadow warnings There are thousands of warnings about one macro in a W=2 build: include/linux/filter.h:561:6: warning: declaration of 'ret' shadows a previous local [-Wshadow] Prefix all the locals in that macro with __ to avoid most of these warnings. Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201026162110.3710415-1-arnd@kernel.org --- include/linux/filter.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 72d62cbc1578..1b62397bd124 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -558,21 +558,21 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); #define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ - u32 ret; \ + u32 __ret; \ cant_migrate(); \ if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ - struct bpf_prog_stats *stats; \ - u64 start = sched_clock(); \ - ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - stats = this_cpu_ptr(prog->aux->stats); \ - u64_stats_update_begin(&stats->syncp); \ - stats->cnt++; \ - stats->nsecs += sched_clock() - start; \ - u64_stats_update_end(&stats->syncp); \ + struct bpf_prog_stats *__stats; \ + u64 __start = sched_clock(); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + __stats = this_cpu_ptr(prog->aux->stats); \ + u64_stats_update_begin(&__stats->syncp); \ + __stats->cnt++; \ + __stats->nsecs += sched_clock() - __start; \ + u64_stats_update_end(&__stats->syncp); \ } else { \ - ret = dfunc(ctx, (prog)->insnsi, 
(prog)->bpf_func); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ } \ - ret; }) + __ret; }) #define BPF_PROG_RUN(prog, ctx) \ __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func) -- cgit v1.2.3 From 1c534352f47fd83eb08075ac2474f707e74bf7f7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Oct 2020 17:35:19 +0200 Subject: cpufreq: Introduce CPUFREQ_NEED_UPDATE_LIMITS driver flag Generally, a cpufreq driver may need to update some internal upper and lower frequency boundaries on policy max and min changes, respectively, but currently this does not work if the target frequency does not change along with the policy limit. Namely, if the target frequency does not change along with the policy min or max, the "target_freq == policy->cur" check in __cpufreq_driver_target() prevents driver callbacks from being invoked and they do not even have a chance to update the corresponding internal boundary. This particularly affects the "powersave" and "performance" governors that always set the target frequency to one of the policy limits and it never changes when the other limit is updated. To allow the cpufreq drivers needing to update internal frequency boundaries on policy limit changes to avoid this issue, introduce a new driver flag, CPUFREQ_NEED_UPDATE_LIMITS, that (when set) will neutralize the check mentioned above. Signed-off-by: Rafael J.
Wysocki Acked-by: Viresh Kumar --- include/linux/cpufreq.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index fa37b1c66443..038ed83aab41 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -298,7 +298,7 @@ __ATTR(_name, 0644, show_##_name, store_##_name) struct cpufreq_driver { char name[CPUFREQ_NAME_LEN]; - u8 flags; + u16 flags; void *driver_data; /* needed by all drivers */ @@ -422,6 +422,14 @@ struct cpufreq_driver { */ #define CPUFREQ_IS_COOLING_DEV BIT(7) +/* + * Set by drivers that need to update internale upper and lower boundaries along + * with the target frequency and so the core and governors should also invoke + * the diver if the target frequency does not change, but the policy min or max + * may have changed. + */ +#define CPUFREQ_NEED_UPDATE_LIMITS BIT(8) + int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); -- cgit v1.2.3 From 1de111b51b829bcf01d2e57971f8fd07a665fa3f Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 23 Oct 2020 08:47:50 -0700 Subject: KVM: arm64: ARM_SMCCC_ARCH_WORKAROUND_1 doesn't return SMCCC_RET_NOT_REQUIRED According to the SMCCC spec[1](7.5.2 Discovery) the ARM_SMCCC_ARCH_WORKAROUND_1 function id only returns 0, 1, and SMCCC_RET_NOT_SUPPORTED. 0 is "workaround required and safe to call this function" 1 is "workaround not required but safe to call this function" SMCCC_RET_NOT_SUPPORTED is "might be vulnerable or might not be, who knows, I give up!" SMCCC_RET_NOT_SUPPORTED might as well mean "workaround required, except calling this function may not work because it isn't implemented in some cases". Wonderful. 
We map this SMC call to 0 is SPECTRE_MITIGATED 1 is SPECTRE_UNAFFECTED SMCCC_RET_NOT_SUPPORTED is SPECTRE_VULNERABLE For KVM hypercalls (hvc), we've implemented this function id to return SMCCC_RET_NOT_SUPPORTED, 0, and SMCCC_RET_NOT_REQUIRED. One of those isn't supposed to be there. Per the code we call arm64_get_spectre_v2_state() to figure out what to return for this feature discovery call. 0 is SPECTRE_MITIGATED SMCCC_RET_NOT_REQUIRED is SPECTRE_UNAFFECTED SMCCC_RET_NOT_SUPPORTED is SPECTRE_VULNERABLE Let's clean this up so that KVM tells the guest this mapping: 0 is SPECTRE_MITIGATED 1 is SPECTRE_UNAFFECTED SMCCC_RET_NOT_SUPPORTED is SPECTRE_VULNERABLE Note: SMCCC_RET_NOT_AFFECTED is 1 but isn't part of the SMCCC spec Fixes: c118bbb52743 ("arm64: KVM: Propagate full Spectre v2 workaround state to KVM guests") Signed-off-by: Stephen Boyd Acked-by: Marc Zyngier Acked-by: Will Deacon Cc: Andre Przywara Cc: Steven Price Cc: Marc Zyngier Cc: stable@vger.kernel.org Link: https://developer.arm.com/documentation/den0028/latest [1] Link: https://lore.kernel.org/r/20201023154751.1973872-1-swboyd@chromium.org Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 885c9ffc835c..f860645f6512 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -87,6 +87,8 @@ ARM_SMCCC_SMC_32, \ 0, 0x7fff) +#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 + /* Paravirtualised time calls (defined by ARM DEN0057A) */ #define ARM_SMCCC_HV_PV_TIME_FEATURES \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ -- cgit v1.2.3 From cbdc0f54560f94c2205ddbebb5464d65868af0d8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:33:18 +0200 Subject: usb: fix kernel-doc markups There is a common comment marked, instead, with kernel-doc notation. 
Also, some identifiers have different names between their prototypes and the kernel-doc markup. Signed-off-by: Mauro Carvalho Chehab Acked-by: Felipe Balbi Link: https://lore.kernel.org/r/0b964be3884def04fcd20ea5c12cb90d0014871c.1603469755.git.mchehab+huawei@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/composite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 2040696d75b6..a2d229ab63ba 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -437,7 +437,7 @@ static inline struct usb_composite_driver *to_cdriver( #define OS_STRING_IDX 0xEE /** - * struct usb_composite_device - represents one composite usb gadget + * struct usb_composite_dev - represents one composite usb gadget * @gadget: read-only, abstracts the gadget's usb peripheral controller * @req: used for control responses; buffer is pre-allocated * @os_desc_req: used for OS descriptors responses; buffer is pre-allocated -- cgit v1.2.3 From 13150bc5416f45234c955e5bed91623d178c6117 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 27 Oct 2020 16:11:32 +0100 Subject: module: use hidden visibility for weak symbol references Geert reports that commit be2881824ae9eb92 ("arm64/build: Assert for unwanted sections") results in build errors on arm64 for configurations that have CONFIG_MODULES disabled. The commit in question added ASSERT()s to the arm64 linker script to ensure that linker generated sections such as .got.plt etc are empty, but as it turns out, there are corner cases where the linker does emit content into those sections. More specifically, weak references to function symbols (which can remain unsatisfied, and can therefore not be emitted as relative references) will be emitted as GOT and PLT entries when linking the kernel in PIE mode (which is the case when CONFIG_RELOCATABLE is enabled, which is on by default). 
What happens is that code such as struct device *(*fn)(struct device *dev); struct device *iommu_device; fn = symbol_get(mdev_get_iommu_device); if (fn) { iommu_device = fn(dev); essentially gets converted into the following when CONFIG_MODULES is off: struct device *iommu_device; if (&mdev_get_iommu_device) { iommu_device = mdev_get_iommu_device(dev); where mdev_get_iommu_device is emitted as a weak symbol reference into the object file. The first reference is decorated with an ordinary ABS64 data relocation (which yields 0x0 if the reference remains unsatisfied). However, the indirect call is turned into a direct call covered by a R_AARCH64_CALL26 relocation, which is converted into a call via a PLT entry taking the target address from the associated GOT entry. Given that such GOT and PLT entries are unnecessary for fully linked binaries such as the kernel, let's give these weak symbol references hidden visibility, so that the linker knows that the weak reference via R_AARCH64_CALL26 can simply remain unsatisfied. 
Signed-off-by: Ard Biesheuvel Tested-by: Geert Uytterhoeven Reviewed-by: Fangrui Song Acked-by: Jessica Yu Cc: Jessica Yu Cc: Kees Cook Cc: Geert Uytterhoeven Cc: Nick Desaulniers Link: https://lore.kernel.org/r/20201027151132.14066-1-ardb@kernel.org Signed-off-by: Will Deacon --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 7ccdf87f376f..6264617bab4d 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -740,7 +740,7 @@ static inline bool within_module(unsigned long addr, const struct module *mod) } /* Get/put a kernel symbol (calls should be symmetric) */ -#define symbol_get(x) ({ extern typeof(x) x __attribute__((weak)); &(x); }) +#define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); }) #define symbol_put(x) do { } while (0) #define symbol_put_addr(x) do { } while (0) -- cgit v1.2.3 From 6a6223ec7779dfdabb9c2567bb42079bc300cf27 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:13 +0100 Subject: blk-mq: docs: add kernel-doc description for a new struct member As reported by kernel-doc: ./include/linux/blk-mq.h:267: warning: Function parameter or member 'active_queues_shared_sbitmap' not described in 'blk_mq_tag_set' There is now a new member for struct blk_mq_tag_set. Add a description for it, based on the commit that introduced it. 
Fixes: f1b49fdc1c64 ("blk-mq: Record active_queues_shared_sbitmap per tag_set for when using shared sbitmap") Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Jens Axboe Reviewed-by: John Garry Link: https://lore.kernel.org/r/8e513153b83eefc05e358f51f2632b592c3f6772.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b23eeca4d677..794b2a33a2c3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -235,6 +235,8 @@ enum hctx_type { * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. + * @active_queues_shared_sbitmap: + * number of active request queues per tag set. * @__bitmap_tags: A shared tags sbitmap, used over all hctx's * @__breserved_tags: * A shared reserved tags sbitmap, used over all hctx's -- cgit v1.2.3 From 89b422354409c275e898d26607201797cc05a932 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:17 +0100 Subject: mm: pagemap.h: fix two kernel-doc markups Changeset a8cf7f272b5a ("mm: add find_lock_head") renamed the index parameter, but forgot to update the kernel-doc markups accordingly. 
Fixes: a8cf7f272b5a ("mm: add find_lock_head") Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/dce89b296a4f5f9f8f798d5e76b6736c14a916ac.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/pagemap.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c77b7c31b2e4..e1e19c1f9ec9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -344,9 +344,9 @@ static inline struct page *find_get_page_flags(struct address_space *mapping, /** * find_lock_page - locate, pin and lock a pagecache page * @mapping: the address_space to search - * @offset: the page index + * @index: the page index * - * Looks up the page cache entry at @mapping & @offset. If there is a + * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, it is returned locked and with an increased * refcount. * @@ -363,9 +363,9 @@ static inline struct page *find_lock_page(struct address_space *mapping, /** * find_lock_head - Locate, pin and lock a pagecache page. * @mapping: The address_space to search. - * @offset: The page index. + * @index: The page index. * - * Looks up the page cache entry at @mapping & @offset. If there is a + * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, its head page is returned locked and with an increased * refcount. * -- cgit v1.2.3 From e86c6569c588a01f20e7554cc245f8fae831957b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:18 +0100 Subject: net: phy: remove kernel-doc duplication Sphinx 3 now checks for duplicated function declarations: .../Documentation/networking/kapi:143: ../include/linux/phy.h:163: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. 
Declaration is 'unsigned int phy_supported_speeds (struct phy_device *phy, unsigned int *speeds, unsigned int size)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1034: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int phy_read_mmd (struct phy_device *phydev, int devad, u32 regnum)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1076: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int __phy_read_mmd (struct phy_device *phydev, int devad, u32 regnum)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1088: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int phy_write_mmd (struct phy_device *phydev, int devad, u32 regnum, u16 val)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1100: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int __phy_write_mmd (struct phy_device *phydev, int devad, u32 regnum, u16 val)'. It turns out that both the C and the H files have the same kernel-doc markup for the same functions. Let's drop the one in the header file, keeping the one closer to the code. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/75e9a357f9a716833d2094b04898754876365e68.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/phy.h | 40 +++++----------------------------------- 1 file changed, 5 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index eb3cb1a98b45..56563e5e0dc7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -147,16 +147,8 @@ typedef enum { PHY_INTERFACE_MODE_MAX, } phy_interface_t; -/** +/* * phy_supported_speeds - return all speeds currently supported by a PHY device - * @phy: The PHY device to return supported speeds of. - * @speeds: buffer to store supported speeds in.
- * @size: size of speeds buffer. - * - * Description: Returns the number of supported speeds, and fills - * the speeds buffer with the supported speeds. If speeds buffer is - * too small to contain all currently supported speeds, will return as - * many speeds as can fit. */ unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int *speeds, @@ -1022,14 +1014,9 @@ static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, regnum, mask, set); } -/** +/* * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. - * @phydev: The phy_device struct - * @devad: The MMD to read from - * @regnum: The register on the MMD to read - * - * Same rules as for phy_read(); */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); @@ -1064,38 +1051,21 @@ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); __ret; \ }) -/** +/* * __phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. - * @phydev: The phy_device struct - * @devad: The MMD to read from - * @regnum: The register on the MMD to read - * - * Same rules as for __phy_read(); */ int __phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); -/** +/* * phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. - * @phydev: The phy_device struct - * @devad: The MMD to write to - * @regnum: The register on the MMD to read - * @val: value to write to @regnum - * - * Same rules as for phy_write(); */ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); -/** +/* * __phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. 
- * @phydev: The phy_device struct - * @devad: The MMD to write to - * @regnum: The register on the MMD to read - * @val: value to write to @regnum - * - * Same rules as for __phy_write(); */ int __phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); -- cgit v1.2.3 From cf38cc9f1e71151f22584c40357afaab6609384b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:23 +0100 Subject: locking/refcount: move kernel-doc markups to the proper place Changeset a435b9a14356 ("locking/refcount: Provide __refcount API to obtain the old value") added a set of functions starting with __ that have a new parameter, adding a series of new warnings: $ ./scripts/kernel-doc -none include/linux/refcount.h include/linux/refcount.h:169: warning: Function parameter or member 'oldp' not described in '__refcount_add_not_zero' include/linux/refcount.h:208: warning: Function parameter or member 'oldp' not described in '__refcount_add' include/linux/refcount.h:239: warning: Function parameter or member 'oldp' not described in '__refcount_inc_not_zero' include/linux/refcount.h:261: warning: Function parameter or member 'oldp' not described in '__refcount_inc' include/linux/refcount.h:291: warning: Function parameter or member 'oldp' not described in '__refcount_sub_and_test' include/linux/refcount.h:327: warning: Function parameter or member 'oldp' not described in '__refcount_dec_and_test' include/linux/refcount.h:347: warning: Function parameter or member 'oldp' not described in '__refcount_dec' The issue is that the kernel-doc markups are now misplaced, as they should be added just before the functions. So, move the kernel-doc markups to the proper places, in order to drop the warnings. It should be noticed that git show produces a crappy output, for this patch without "--patience" flag. 
Fixes: a435b9a14356 ("locking/refcount: Provide __refcount API to obtain the old value") Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/7985c31d1ace591bc5e1faa05c367f1295b78afd.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/refcount.h | 130 +++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 7fabb1af18e0..497990c69b0b 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -147,24 +147,6 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } -/** - * refcount_add_not_zero - add a value to a refcount unless it is 0 - * @i: the value to add to the refcount - * @r: the refcount - * - * Will saturate at REFCOUNT_SATURATED and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_inc(), or one of its variants, should instead be used to - * increment a reference count. 
- * - * Return: false if the passed refcount is 0, true otherwise - */ static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); @@ -183,17 +165,12 @@ static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, in return old; } -static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) -{ - return __refcount_add_not_zero(i, r, NULL); -} - /** - * refcount_add - add a value to a refcount + * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount * @r: the refcount * - * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. + * Will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency @@ -203,7 +180,14 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. + * + * Return: false if the passed refcount is 0, true otherwise */ +static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) +{ + return __refcount_add_not_zero(i, r, NULL); +} + static inline void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); @@ -217,11 +201,32 @@ static inline void __refcount_add(int i, refcount_t *r, int *oldp) refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } +/** + * refcount_add - add a value to a refcount + * @i: the value to add to the refcount + * @r: the refcount + * + * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). 
It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + */ static inline void refcount_add(int i, refcount_t *r) { __refcount_add(i, r, NULL); } +static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero(1, r, oldp); +} + /** * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment @@ -235,14 +240,14 @@ static inline void refcount_add(int i, refcount_t *r) * * Return: true if the increment was successful, false otherwise */ -static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) +static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { - return __refcount_add_not_zero(1, r, oldp); + return __refcount_inc_not_zero(r, NULL); } -static inline __must_check bool refcount_inc_not_zero(refcount_t *r) +static inline void __refcount_inc(refcount_t *r, int *oldp) { - return __refcount_inc_not_zero(r, NULL); + __refcount_add(1, r, oldp); } /** @@ -257,14 +262,27 @@ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. 
*/ -static inline void __refcount_inc(refcount_t *r, int *oldp) +static inline void refcount_inc(refcount_t *r) { - __refcount_add(1, r, oldp); + __refcount_inc(r, NULL); } -static inline void refcount_inc(refcount_t *r) +static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { - __refcount_inc(r, NULL); + int old = atomic_fetch_sub_release(i, &r->refs); + + if (oldp) + *oldp = old; + + if (old == i) { + smp_acquire__after_ctrl_dep(); + return true; + } + + if (unlikely(old < 0 || old - i < 0)) + refcount_warn_saturate(r, REFCOUNT_SUB_UAF); + + return false; } /** @@ -287,27 +305,14 @@ static inline void refcount_inc(refcount_t *r) * * Return: true if the resulting refcount is 0, false otherwise */ -static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) +static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { - int old = atomic_fetch_sub_release(i, &r->refs); - - if (oldp) - *oldp = old; - - if (old == i) { - smp_acquire__after_ctrl_dep(); - return true; - } - - if (unlikely(old < 0 || old - i < 0)) - refcount_warn_saturate(r, REFCOUNT_SUB_UAF); - - return false; + return __refcount_sub_and_test(i, r, NULL); } -static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) +static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) { - return __refcount_sub_and_test(i, r, NULL); + return __refcount_sub_and_test(1, r, oldp); } /** @@ -323,26 +328,11 @@ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) * * Return: true if the resulting refcount is 0, false otherwise */ -static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) -{ - return __refcount_sub_and_test(1, r, oldp); -} - static inline __must_check bool refcount_dec_and_test(refcount_t *r) { return __refcount_dec_and_test(r, NULL); } -/** - * refcount_dec - decrement a refcount - * @r: the refcount - * - * Similar to atomic_dec(), it 
will WARN on underflow and fail to decrement - * when saturated at REFCOUNT_SATURATED. - * - * Provides release memory ordering, such that prior loads and stores are done - * before. - */ static inline void __refcount_dec(refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(1, &r->refs); @@ -354,6 +344,16 @@ static inline void __refcount_dec(refcount_t *r, int *oldp) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } +/** + * refcount_dec - decrement a refcount + * @r: the refcount + * + * Similar to atomic_dec(), it will WARN on underflow and fail to decrement + * when saturated at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before. + */ static inline void refcount_dec(refcount_t *r) { __refcount_dec(r, NULL); -- cgit v1.2.3 From e029c5f2798720b463e8df0e184a4d1036311b43 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Mon, 26 Oct 2020 21:49:14 -0700 Subject: ext4: make num of fast commit blocks configurable This patch reserves a field in the jbd2 superblock for number of fast commit blocks. When this value is non-zero, Ext4 uses this field to set the number of fast commit blocks. 
Fixes: 6866d7b3f2bb ("ext4/jbd2: add fast commit initialization") Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201027044915.2553163-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index fb3d71ad6eea..7e88bbc16ffb 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -263,7 +263,10 @@ typedef struct journal_superblock_s /* 0x0050 */ __u8 s_checksum_type; /* checksum type */ __u8 s_padding2[3]; - __u32 s_padding[42]; +/* 0x0054 */ + __be32 s_num_fc_blks; /* Number of fast commit blocks */ +/* 0x0058 */ + __u32 s_padding[41]; __be32 s_checksum; /* crc32c(superblock) */ /* 0x0100 */ -- cgit v1.2.3 From ea4b01d9b81f5f381fc6832bc31046878a2d1a5d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:27 +0100 Subject: jbd2: fix a kernel-doc markup The kernel-doc markup that documents _fc_replay_callback is missing an asterisk, causing this warning: ../include/linux/jbd2.h:1271: warning: Function parameter or member 'j_fc_replay_callback' not described in 'journal_s' When building the docs. 
Fixes: 609f928af48f ("jbd2: fast commit recovery path") Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/6055927ada2015b55b413cdd2670533bdc9a8da2.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 7e88bbc16ffb..1d5566af48ac 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1256,7 +1256,7 @@ struct journal_s */ void (*j_fc_cleanup_callback)(struct journal_s *journal, int); - /* + /** * @j_fc_replay_callback: * * File-system specific function that performs replay of a fast -- cgit v1.2.3 From 80ade22c06ca115b81dd168e99479c8e09843513 Mon Sep 17 00:00:00 2001 From: Sudeep Dutt Date: Tue, 27 Oct 2020 20:14:15 -0700 Subject: misc: mic: remove the MIC drivers This patch removes the MIC drivers from the kernel tree since the corresponding devices have been discontinued. Removing the dma and char-misc changes in one patch and merging via the char-misc tree is best to avoid any potential build breakage. Cc: Nikhil Rao Reviewed-by: Ashutosh Dixit Signed-off-by: Sudeep Dutt Acked-By: Vinod Koul Reviewed-by: Sherry Sun Link: https://lore.kernel.org/r/8c1443136563de34699d2c084df478181c205db4.1603854416.git.sudeep.dutt@intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/mic_bus.h | 100 ---- include/linux/scif.h | 1339 ----------------------------------------------- 2 files changed, 1439 deletions(-) delete mode 100644 include/linux/mic_bus.h delete mode 100644 include/linux/scif.h (limited to 'include/linux') diff --git a/include/linux/mic_bus.h b/include/linux/mic_bus.h deleted file mode 100644 index e99c789424e0..000000000000 --- a/include/linux/mic_bus.h +++ /dev/null @@ -1,100 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. 
- * - * Intel MIC Bus driver. - * - * This implementation is very similar to the virtio bus driver - * implementation @ include/linux/virtio.h. - */ -#ifndef _MIC_BUS_H_ -#define _MIC_BUS_H_ -/* - * Everything a mbus driver needs to work with any particular mbus - * implementation. - */ -#include -#include - -struct mbus_device_id { - __u32 device; - __u32 vendor; -}; - -#define MBUS_DEV_DMA_HOST 2 -#define MBUS_DEV_DMA_MIC 3 -#define MBUS_DEV_ANY_ID 0xffffffff - -/** - * mbus_device - representation of a device using mbus - * @mmio_va: virtual address of mmio space - * @hw_ops: the hardware ops supported by this device. - * @id: the device type identification (used to match it with a driver). - * @dev: underlying device. - * be used to communicate with. - * @index: unique position on the mbus bus - */ -struct mbus_device { - void __iomem *mmio_va; - struct mbus_hw_ops *hw_ops; - struct mbus_device_id id; - struct device dev; - int index; -}; - -/** - * mbus_driver - operations for a mbus I/O driver - * @driver: underlying device driver (populate name and owner). - * @id_table: the ids serviced by this driver. - * @probe: the function to call when a device is found. Returns 0 or -errno. - * @remove: the function to call when a device is removed. - */ -struct mbus_driver { - struct device_driver driver; - const struct mbus_device_id *id_table; - int (*probe)(struct mbus_device *dev); - void (*scan)(struct mbus_device *dev); - void (*remove)(struct mbus_device *dev); -}; - -/** - * struct mic_irq - opaque pointer used as cookie - */ -struct mic_irq; - -/** - * mbus_hw_ops - Hardware operations for accessing a MIC device on the MIC bus. 
- */ -struct mbus_hw_ops { - struct mic_irq* (*request_threaded_irq)(struct mbus_device *mbdev, - irq_handler_t handler, - irq_handler_t thread_fn, - const char *name, void *data, - int intr_src); - void (*free_irq)(struct mbus_device *mbdev, - struct mic_irq *cookie, void *data); - void (*ack_interrupt)(struct mbus_device *mbdev, int num); -}; - -struct mbus_device * -mbus_register_device(struct device *pdev, int id, const struct dma_map_ops *dma_ops, - struct mbus_hw_ops *hw_ops, int index, - void __iomem *mmio_va); -void mbus_unregister_device(struct mbus_device *mbdev); - -int mbus_register_driver(struct mbus_driver *drv); -void mbus_unregister_driver(struct mbus_driver *drv); - -static inline struct mbus_device *dev_to_mbus(struct device *_dev) -{ - return container_of(_dev, struct mbus_device, dev); -} - -static inline struct mbus_driver *drv_to_mbus(struct device_driver *drv) -{ - return container_of(drv, struct mbus_driver, driver); -} - -#endif /* _MIC_BUS_H */ diff --git a/include/linux/scif.h b/include/linux/scif.h deleted file mode 100644 index 329e695b8fe5..000000000000 --- a/include/linux/scif.h +++ /dev/null @@ -1,1339 +0,0 @@ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2014 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Copyright(c) 2014 Intel Corporation. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Intel SCIF driver. 
- * - */ -#ifndef __SCIF_H__ -#define __SCIF_H__ - -#include -#include -#include -#include - -#define SCIF_ACCEPT_SYNC 1 -#define SCIF_SEND_BLOCK 1 -#define SCIF_RECV_BLOCK 1 - -enum { - SCIF_PROT_READ = (1 << 0), - SCIF_PROT_WRITE = (1 << 1) -}; - -enum { - SCIF_MAP_FIXED = 0x10, - SCIF_MAP_KERNEL = 0x20, -}; - -enum { - SCIF_FENCE_INIT_SELF = (1 << 0), - SCIF_FENCE_INIT_PEER = (1 << 1), - SCIF_SIGNAL_LOCAL = (1 << 4), - SCIF_SIGNAL_REMOTE = (1 << 5) -}; - -enum { - SCIF_RMA_USECPU = (1 << 0), - SCIF_RMA_USECACHE = (1 << 1), - SCIF_RMA_SYNC = (1 << 2), - SCIF_RMA_ORDERED = (1 << 3) -}; - -/* End of SCIF Admin Reserved Ports */ -#define SCIF_ADMIN_PORT_END 1024 - -/* End of SCIF Reserved Ports */ -#define SCIF_PORT_RSVD 1088 - -typedef struct scif_endpt *scif_epd_t; -typedef struct scif_pinned_pages *scif_pinned_pages_t; - -/** - * struct scif_range - SCIF registered range used in kernel mode - * @cookie: cookie used internally by SCIF - * @nr_pages: number of pages of PAGE_SIZE - * @prot_flags: R/W protection - * @phys_addr: Array of bus addresses - * @va: Array of kernel virtual addresses backed by the pages in the phys_addr - * array. The va is populated only when called on the host for a remote - * SCIF connection on MIC. This is required to support the use case of DMA - * between MIC and another device which is not a SCIF node e.g., an IB or - * ethernet NIC. - */ -struct scif_range { - void *cookie; - int nr_pages; - int prot_flags; - dma_addr_t *phys_addr; - void __iomem **va; -}; - -/** - * struct scif_pollepd - SCIF endpoint to be monitored via scif_poll - * @epd: SCIF endpoint - * @events: requested events - * @revents: returned events - */ -struct scif_pollepd { - scif_epd_t epd; - __poll_t events; - __poll_t revents; -}; - -/** - * scif_peer_dev - representation of a peer SCIF device - * - * Peer devices show up as PCIe devices for the mgmt node but not the cards. 
- * The mgmt node discovers all the cards on the PCIe bus and informs the other - * cards about their peers. Upon notification of a peer a node adds a peer - * device to the peer bus to maintain symmetry in the way devices are - * discovered across all nodes in the SCIF network. - * - * @dev: underlying device - * @dnode - The destination node which this device will communicate with. - */ -struct scif_peer_dev { - struct device dev; - u8 dnode; -}; - -/** - * scif_client - representation of a SCIF client - * @name: client name - * @probe - client method called when a peer device is registered - * @remove - client method called when a peer device is unregistered - * @si - subsys_interface used internally for implementing SCIF clients - */ -struct scif_client { - const char *name; - void (*probe)(struct scif_peer_dev *spdev); - void (*remove)(struct scif_peer_dev *spdev); - struct subsys_interface si; -}; - -#define SCIF_OPEN_FAILED ((scif_epd_t)-1) -#define SCIF_REGISTER_FAILED ((off_t)-1) -#define SCIF_MMAP_FAILED ((void *)-1) - -/** - * scif_open() - Create an endpoint - * - * Return: - * Upon successful completion, scif_open() returns an endpoint descriptor to - * be used in subsequent SCIF functions calls to refer to that endpoint; - * otherwise in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is - * returned and errno is set to indicate the error; in kernel mode a NULL - * scif_epd_t is returned. - * - * Errors: - * ENOMEM - Insufficient kernel memory was available - */ -scif_epd_t scif_open(void); - -/** - * scif_bind() - Bind an endpoint to a port - * @epd: endpoint descriptor - * @pn: port number - * - * scif_bind() binds endpoint epd to port pn, where pn is a port number on the - * local node. If pn is zero, a port number greater than or equal to - * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to - * exactly one local port. 
Ports less than 1024 when requested can only be bound - * by system (or root) processes or by processes executed by privileged users. - * - * Return: - * Upon successful completion, scif_bind() returns the port number to which epd - * is bound; otherwise in user mode -1 is returned and errno is set to - * indicate the error; in kernel mode the negative of one of the following - * errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * EINVAL - the endpoint or the port is already bound - * EISCONN - The endpoint is already connected - * ENOSPC - No port number available for assignment - * EACCES - The port requested is protected and the user is not the superuser - */ -int scif_bind(scif_epd_t epd, u16 pn); - -/** - * scif_listen() - Listen for connections on an endpoint - * @epd: endpoint descriptor - * @backlog: maximum pending connection requests - * - * scif_listen() marks the endpoint epd as a listening endpoint - that is, as - * an endpoint that will be used to accept incoming connection requests. Once - * so marked, the endpoint is said to be in the listening state and may not be - * used as the endpoint of a connection. - * - * The endpoint, epd, must have been bound to a port. - * - * The backlog argument defines the maximum length to which the queue of - * pending connections for epd may grow. If a connection request arrives when - * the queue is full, the client may receive an error with an indication that - * the connection was refused. - * - * Return: - * Upon successful completion, scif_listen() returns 0; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode the - * negative of one of the following errors is returned. 
- * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * EINVAL - the endpoint is not bound to a port - * EISCONN - The endpoint is already connected or listening - */ -int scif_listen(scif_epd_t epd, int backlog); - -/** - * scif_connect() - Initiate a connection on a port - * @epd: endpoint descriptor - * @dst: global id of port to which to connect - * - * The scif_connect() function requests the connection of endpoint epd to remote - * port dst. If the connection is successful, a peer endpoint, bound to dst, is - * created on node dst.node. On successful return, the connection is complete. - * - * If the endpoint epd has not already been bound to a port, scif_connect() - * will bind it to an unused local port. - * - * A connection is terminated when an endpoint of the connection is closed, - * either explicitly by scif_close(), or when a process that owns one of the - * endpoints of the connection is terminated. - * - * In user space, scif_connect() supports an asynchronous connection mode - * if the application has set the O_NONBLOCK flag on the endpoint via the - * fcntl() system call. Setting this flag will result in the calling process - * not to wait during scif_connect(). - * - * Return: - * Upon successful completion, scif_connect() returns the port ID to which the - * endpoint, epd, is bound; otherwise in user mode -1 is returned and errno is - * set to indicate the error; in kernel mode the negative of one of the - * following errors is returned. 
- * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNREFUSED - The destination was not listening for connections or refused - * the connection request - * EINVAL - dst.port is not a valid port ID - * EISCONN - The endpoint is already connected - * ENOMEM - No buffer space is available - * ENODEV - The destination node does not exist, or the node is lost or existed, - * but is not currently in the network since it may have crashed - * ENOSPC - No port number available for assignment - * EOPNOTSUPP - The endpoint is listening and cannot be connected - */ -int scif_connect(scif_epd_t epd, struct scif_port_id *dst); - -/** - * scif_accept() - Accept a connection on an endpoint - * @epd: endpoint descriptor - * @peer: global id of port to which connected - * @newepd: new connected endpoint descriptor - * @flags: flags - * - * The scif_accept() call extracts the first connection request from the queue - * of pending connections for the port on which epd is listening. scif_accept() - * creates a new endpoint, bound to the same port as epd, and allocates a new - * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new - * endpoint is connected to the endpoint through which the connection was - * requested. epd is unaffected by this call, and remains in the listening - * state. - * - * On successful return, peer holds the global port identifier (node id and - * local port number) of the port which requested the connection. - * - * A connection is terminated when an endpoint of the connection is closed, - * either explicitly by scif_close(), or when a process that owns one of the - * endpoints of the connection is terminated. - * - * The number of connections that can (subsequently) be accepted on epd is only - * limited by system resources (memory). - * - * The flags argument is formed by OR'ing together zero or more of the - * following values. - * SCIF_ACCEPT_SYNC - block until a connection request is presented. 
If - * SCIF_ACCEPT_SYNC is not in flags, and no pending - * connections are present on the queue, scif_accept() - * fails with an EAGAIN error - * - * In user mode, the select() and poll() functions can be used to determine - * when there is a connection request. In kernel mode, the scif_poll() - * function may be used for this purpose. A readable event will be delivered - * when a connection is requested. - * - * Return: - * Upon successful completion, scif_accept() returns 0; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode the - * negative of one of the following errors is returned. - * - * Errors: - * EAGAIN - SCIF_ACCEPT_SYNC is not set and no connections are present to be - * accepted or SCIF_ACCEPT_SYNC is not set and remote node failed to complete - * its connection request - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * EINTR - Interrupted function - * EINVAL - epd is not a listening endpoint, or flags is invalid, or peer is - * NULL, or newepd is NULL - * ENODEV - The requesting node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOENT - Secondary part of epd registration failed - */ -int scif_accept(scif_epd_t epd, struct scif_port_id *peer, scif_epd_t - *newepd, int flags); - -/** - * scif_close() - Close an endpoint - * @epd: endpoint descriptor - * - * scif_close() closes an endpoint and performs necessary teardown of - * facilities associated with that endpoint. - * - * If epd is a listening endpoint then it will no longer accept connection - * requests on the port to which it is bound. Any pending connection requests - * are rejected. - * - * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs - * which are in-process through epd or its peer endpoint will complete before - * scif_close() returns. 
Registered windows of the local and peer endpoints are - * released as if scif_unregister() was called against each window. - * - * Closing a SCIF endpoint does not affect local registered memory mapped by - * a SCIF endpoint on a remote node. The local memory remains mapped by the peer - * SCIF endpoint until explicitly removed by calling munmap(..) by the peer. - * - * If the peer endpoint's receive queue is not empty at the time that epd is - * closed, then the peer endpoint can be passed as the endpoint parameter to - * scif_recv() until the receive queue is empty. - * - * epd is freed and may no longer be accessed. - * - * Return: - * Upon successful completion, scif_close() returns 0; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode the - * negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - */ -int scif_close(scif_epd_t epd); - -/** - * scif_send() - Send a message - * @epd: endpoint descriptor - * @msg: message buffer address - * @len: message length - * @flags: blocking mode flags - * - * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data - * are copied from memory starting at address msg. On successful execution the - * return value of scif_send() is the number of bytes that were sent, and is - * zero if no bytes were sent because len was zero. scif_send() may be called - * only when the endpoint is in a connected state. - * - * If a scif_send() call is non-blocking, then it sends only those bytes which - * can be sent without waiting, up to a maximum of len bytes. - * - * If a scif_send() call is blocking, then it normally returns after sending - * all len bytes. If a blocking call is interrupted or the connection is - * reset, the call is considered successful if some bytes were sent or len is - * zero, otherwise the call is considered unsuccessful.
- * - * In user mode, the select() and poll() functions can be used to determine - * when the send queue is not full. In kernel mode, the scif_poll() function - * may be used for this purpose. - * - * It is recommended that scif_send()/scif_recv() only be used for short - * control-type message communication between SCIF endpoints. The SCIF RMA - * APIs are expected to provide better performance for transfer sizes of - * 1024 bytes or longer for the current MIC hardware and software - * implementation. - * - * scif_send() will block until the entire message is sent if SCIF_SEND_BLOCK - * is passed as the flags argument. - * - * Return: - * Upon successful completion, scif_send() returns the number of bytes sent; - * otherwise in user mode -1 is returned and errno is set to indicate the - * error; in kernel mode the negative of one of the following errors is - * returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - flags is invalid, or len is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOTCONN - The endpoint is not connected - */ -int scif_send(scif_epd_t epd, void *msg, int len, int flags); - -/** - * scif_recv() - Receive a message - * @epd: endpoint descriptor - * @msg: message buffer address - * @len: message buffer length - * @flags: blocking mode flags - * - * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of - * data are copied to memory starting at address msg. On successful execution - * the return value of scif_recv() is the number of bytes that were received, - * and is zero if no bytes were received because len was zero. scif_recv() may - * be called only when the endpoint is in a connected state. 
- * - * If a scif_recv() call is non-blocking, then it receives only those bytes - * which can be received without waiting, up to a maximum of len bytes. - * - * If a scif_recv() call is blocking, then it normally returns after receiving - * all len bytes. If the blocking call was interrupted due to a disconnection, - * subsequent calls to scif_recv() will copy all bytes received up to the point - * of disconnection. - * - * In user mode, the select() and poll() functions can be used to determine - * when data is available to be received. In kernel mode, the scif_poll() - * function may be used for this purpose. - * - * It is recommended that scif_send()/scif_recv() only be used for short - * control-type message communication between SCIF endpoints. The SCIF RMA - * APIs are expected to provide better performance for transfer sizes of - * 1024 bytes or longer for the current MIC hardware and software - * implementation. - * - * scif_recv() will block until the entire message is received if - * SCIF_RECV_BLOCK is passed as the flags argument. - * - * Return: - * Upon successful completion, scif_recv() returns the number of bytes - * received; otherwise in user mode -1 is returned and errno is set to - * indicate the error; in kernel mode the negative of one of the following - * errors is returned. - * - * Errors: - * EAGAIN - The destination node is returning from a low power state - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - flags is invalid, or len is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOTCONN - The endpoint is not connected - */ -int scif_recv(scif_epd_t epd, void *msg, int len, int flags); - -/** - * scif_register() - Mark a memory region for remote access.
- * @epd: endpoint descriptor - * @addr: starting virtual address - * @len: length of range - * @offset: offset of window - * @prot_flags: read/write protection flags - * @map_flags: mapping flags - * - * The scif_register() function opens a window, a range of whole pages of the - * registered address space of the endpoint epd, starting at offset po and - * continuing for len bytes. The value of po, further described below, is a - * function of the parameters offset and len, and the value of map_flags. Each - * page of the window represents the physical memory page which backs the - * corresponding page of the range of virtual address pages starting at addr - * and continuing for len bytes. addr and len are constrained to be multiples - * of the page size. A successful scif_register() call returns po. - * - * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset - * exactly, and offset is constrained to be a multiple of the page size. The - * mapping established by scif_register() will not replace any existing - * registration; an error is returned if any page within the range [offset, - * offset + len - 1] intersects an existing window. - * - * When SCIF_MAP_FIXED is not set, the implementation uses offset in an - * implementation-defined manner to arrive at po. The po value so chosen will - * be an area of the registered address space that the implementation deems - * suitable for a mapping of len bytes. An offset value of 0 is interpreted as - * granting the implementation complete freedom in selecting po, subject to - * constraints described below. A non-zero value of offset is taken to be a - * suggestion of an offset near which the mapping should be placed. When the - * implementation selects a value for po, it does not replace any extant - * window. In all cases, po will be a multiple of the page size. 
- * - * The physical pages which are so represented by a window are available for - * access in calls to mmap(), scif_readfrom(), scif_writeto(), - * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the - * physical pages represented by the window will not be reused by the memory - * subsystem for any other purpose. Note that the same physical page may be - * represented by multiple windows. - * - * Subsequent operations which change the memory pages to which virtual - * addresses are mapped (such as mmap(), munmap()) have no effect on - * existing window. - * - * If the process will fork(), it is recommended that the registered - * virtual address range be marked with MADV_DONTFORK. Doing so will prevent - * problems due to copy-on-write semantics. - * - * The prot_flags argument is formed by OR'ing together one or more of the - * following values. - * SCIF_PROT_READ - allow read operations from the window - * SCIF_PROT_WRITE - allow write operations to the window - * - * Return: - * Upon successful completion, scif_register() returns the offset at which the - * mapping was placed (po); otherwise in user mode SCIF_REGISTER_FAILED (that - * is ((off_t)-1)) is returned and errno is set to indicate the error; in - * kernel mode the negative of one of the following errors is returned.
- * - * Errors: - * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags, and pages in the range - * [offset, offset + len -1] are already registered - * EAGAIN - The mapping could not be performed due to lack of resources - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - map_flags is invalid, or prot_flags is invalid, or SCIF_MAP_FIXED is - * set in flags, and offset is not a multiple of the page size, or addr is not a - * multiple of the page size, or len is not a multiple of the page size, or is - * 0, or offset is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOTCONN - The endpoint is not connected - */ -off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset, - int prot_flags, int map_flags); - -/** - * scif_unregister() - Unregister a memory region from remote access. - * @epd: endpoint descriptor - * @offset: start of range to unregister - * @len: length of range to unregister - * - * The scif_unregister() function closes those previously registered windows - * which are entirely within the range [offset, offset + len - 1]. It is an - * error to specify a range which intersects only a subrange of a window. - * - * On a successful return, pages within the window may no longer be specified - * in calls to mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(), - * scif_vwriteto(), scif_get_pages, and scif_fence_signal(). The window, - * however, continues to exist until all previous references against it are - * removed. A window is referenced if there is a mapping to it created by - * mmap(), or if scif_get_pages() was called against the window - * (and the pages have not been returned via scif_put_pages()). A window is - * also referenced while an RMA, in which some range of the window is a source - * or destination, is in progress.
Finally a window is referenced while some - * offset in that window was specified to scif_fence_signal(), and the RMAs - * marked by that call to scif_fence_signal() have not completed. While a - * window is in this state, its registered address space pages are not - * available for use in a new registered window. - * - * When all such references to the window have been removed, its references to - * all the physical pages which it represents are removed. Similarly, the - * registered address space pages of the window become available for - * registration in a new window. - * - * Return: - * Upon successful completion, scif_unregister() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. In the event of an - * error, no windows are unregistered. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - the range [offset, offset + len - 1] intersects a subrange of a - * window, or offset is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid for the - * registered address space of epd - */ -int scif_unregister(scif_epd_t epd, off_t offset, size_t len); - -/** - * scif_readfrom() - Copy from a remote address space - * @epd: endpoint descriptor - * @loffset: offset in local registered address space to - * which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space - * from which to copy - * @rma_flags: transfer mode flags - * - * scif_readfrom() copies len bytes from the remote registered address space of - * the peer of endpoint epd, starting at the offset roffset to the local - * registered address space of epd, starting at the offset loffset. 
- * - * Each of the specified ranges [loffset, loffset + len - 1] and [roffset, - * roffset + len - 1] must be within some registered window or windows of the - * local and remote nodes. A range may intersect multiple registered windows, - * but only if those windows are contiguous in the registered address space. - * - * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using - * programmed read/writes. Otherwise the data is copied using DMA. If rma_- - * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the - * transfer is complete. Otherwise, the transfer may be performed asynchron- - * ously. The order in which any two asynchronous RMA operations complete - * is non-deterministic. The synchronization functions, scif_fence_mark()/ - * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to - * the completion of asynchronous RMA operations on the same endpoint. - * - * The DMA transfer of individual bytes is not guaranteed to complete in - * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last - * cacheline or partial cacheline of the source range will become visible on - * the destination node after all other transferred data in the source - * range has become visible on the destination node. - * - * The optimal DMA performance will likely be realized if both - * loffset and roffset are cacheline aligned (are a multiple of 64). Lower - * performance will likely be realized if loffset and roffset are not - * cacheline aligned but are separated by some multiple of 64. The lowest level - * of performance is likely if loffset and roffset are not separated by a - * multiple of 64. - * - * The rma_flags argument is formed by ORing together zero or more of the - * following values. - * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA - * engine. - * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the - * transfer has completed. 
Passing this flag results in the - * current implementation busy waiting and consuming CPU cycles - * while the DMA transfer is in progress for best performance by - * avoiding the interrupt latency. - * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of - * the source range becomes visible on the destination node - * after all other transferred data in the source range has - * become visible on the destination - * - * Return: - * Upon successful completion, scif_readfrom() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EACCES - Attempt to write to a read-only range - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - rma_flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered - * address space of epd, or, The range [roffset, roffset + len - 1] is invalid - * for the registered address space of the peer of epd, or loffset or roffset - * is negative - */ -int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t - roffset, int rma_flags); - -/** - * scif_writeto() - Copy to a remote address space - * @epd: endpoint descriptor - * @loffset: offset in local registered address space - * from which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space to - * which to copy - * @rma_flags: transfer mode flags - * - * scif_writeto() copies len bytes from the local registered address space of - * epd, starting at the offset loffset to the remote registered address space - * of the peer of endpoint epd, starting at the offset roffset. 
- * - * Each of the specified ranges [loffset, loffset + len - 1] and [roffset, - * roffset + len - 1] must be within some registered window or windows of the - * local and remote nodes. A range may intersect multiple registered windows, - * but only if those windows are contiguous in the registered address space. - * - * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using - * programmed read/writes. Otherwise the data is copied using DMA. If rma_- - * flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the - * transfer is complete. Otherwise, the transfer may be performed asynchron- - * ously. The order in which any two asynchronous RMA operations complete - * is non-deterministic. The synchronization functions, scif_fence_mark()/ - * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to - * the completion of asynchronous RMA operations on the same endpoint. - * - * The DMA transfer of individual bytes is not guaranteed to complete in - * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last - * cacheline or partial cacheline of the source range will become visible on - * the destination node after all other transferred data in the source - * range has become visible on the destination node. - * - * The optimal DMA performance will likely be realized if both - * loffset and roffset are cacheline aligned (are a multiple of 64). Lower - * performance will likely be realized if loffset and roffset are not cacheline - * aligned but are separated by some multiple of 64. The lowest level of - * performance is likely if loffset and roffset are not separated by a multiple - * of 64. - * - * The rma_flags argument is formed by ORing together zero or more of the - * following values. - * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA - * engine. - * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the - * transfer has completed. 
Passing this flag results in the - * current implementation busy waiting and consuming CPU cycles - * while the DMA transfer is in progress for best performance by - * avoiding the interrupt latency. - * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of - * the source range becomes visible on the destination node - * after all other transferred data in the source range has - * become visible on the destination - * - * Return: - * Upon successful completion, scif_writeto() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EACCES - Attempt to write to a read-only range - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - rma_flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered - * address space of epd, or, The range [roffset, roffset + len - 1] is invalid - * for the registered address space of the peer of epd, or loffset or roffset - * is negative - */ -int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t - roffset, int rma_flags); - -/** - * scif_vreadfrom() - Copy from a remote address space - * @epd: endpoint descriptor - * @addr: address to which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space - * from which to copy - * @rma_flags: transfer mode flags - * - * scif_vreadfrom() copies len bytes from the remote registered address - * space of the peer of endpoint epd, starting at the offset roffset, to local - * memory, starting at addr. - * - * The specified range [roffset, roffset + len - 1] must be within some - * registered window or windows of the remote nodes.
The range may - * intersect multiple registered windows, but only if those windows are - * contiguous in the registered address space. - * - * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using - * programmed read/writes. Otherwise the data is copied using DMA. If rma_- - * flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after the - * transfer is complete. Otherwise, the transfer may be performed asynchron- - * ously. The order in which any two asynchronous RMA operations complete - * is non-deterministic. The synchronization functions, scif_fence_mark()/ - * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to - * the completion of asynchronous RMA operations on the same endpoint. - * - * The DMA transfer of individual bytes is not guaranteed to complete in - * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last - * cacheline or partial cacheline of the source range will become visible on - * the destination node after all other transferred data in the source - * range has become visible on the destination node. - * - * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back - * the specified local memory range may remain in a pinned state even after - * the specified transfer completes. This may reduce overhead if some or all of - * the same virtual address range is referenced in a subsequent call of - * scif_vreadfrom() or scif_vwriteto(). - * - * The optimal DMA performance will likely be realized if both - * addr and roffset are cacheline aligned (are a multiple of 64). Lower - * performance will likely be realized if addr and roffset are not - * cacheline aligned but are separated by some multiple of 64. The lowest level - * of performance is likely if addr and roffset are not separated by a - * multiple of 64. - * - * The rma_flags argument is formed by ORing together zero or more of the - * following values.
- * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA - * engine. - * SCIF_RMA_USECACHE - enable registration caching - * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the - * transfer has completed. Passing this flag results in the - * current implementation busy waiting and consuming CPU cycles - * while the DMA transfer is in progress for best performance by - * avoiding the interrupt latency. - * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of - * the source range becomes visible on the destination node - * after all other transferred data in the source range has - * become visible on the destination - * - * Return: - * Upon successful completion, scif_vreadfrom() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EACCES - Attempt to write to a read-only range - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - rma_flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the - * registered address space of epd - */ -int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset, - int rma_flags); - -/** - * scif_vwriteto() - Copy to a remote address space - * @epd: endpoint descriptor - * @addr: address from which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space to - * which to copy - * @rma_flags: transfer mode flags - * - * scif_vwriteto() copies len bytes from the local memory, starting at addr, to - * the remote registered address space of the peer of endpoint epd, starting at - * the offset roffset. 
- * - * The specified range [roffset, roffset + len - 1] must be within some - * registered window or windows of the remote nodes. The range may intersect - * multiple registered windows, but only if those windows are contiguous in the - * registered address space. - * - * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using - * programmed read/writes. Otherwise the data is copied using DMA. If rma_- - * flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the - * transfer is complete. Otherwise, the transfer may be performed asynchron- - * ously. The order in which any two asynchronous RMA operations complete - * is non-deterministic. The synchronization functions, scif_fence_mark()/ - * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to - * the completion of asynchronous RMA operations on the same endpoint. - * - * The DMA transfer of individual bytes is not guaranteed to complete in - * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last - * cacheline or partial cacheline of the source range will become visible on - * the destination node after all other transferred data in the source - * range has become visible on the destination node. - * - * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back - * the specified local memory range may remain in a pinned state even after - * the specified transfer completes. This may reduce overhead if some or all of - * the same virtual address range is referenced in a subsequent call of - * scif_vreadfrom() or scif_vwriteto(). - * - * The optimal DMA performance will likely be realized if both - * addr and roffset are cacheline aligned (are a multiple of 64). Lower - * performance will likely be realized if addr and roffset are not cacheline - * aligned but are separated by some multiple of 64. The lowest level of - * performance is likely if addr and roffset are not separated by a multiple of - * 64.
- * - * The rma_flags argument is formed by ORing together zero or more of the - * following values. - * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA - * engine. - * SCIF_RMA_USECACHE - allow registration caching - * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the - * transfer has completed. Passing this flag results in the - * current implementation busy waiting and consuming CPU cycles - * while the DMA transfer is in progress for best performance by - * avoiding the interrupt latency. - * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of - * the source range becomes visible on the destination node - * after all other transferred data in the source range has - * become visible on the destination - * - * Return: - * Upon successful completion, scif_vwriteto() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EACCES - Attempt to write to a read-only range - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - rma_flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the - * registered address space of epd - */ -int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset, - int rma_flags); - -/** - * scif_fence_mark() - Mark previously issued RMAs - * @epd: endpoint descriptor - * @flags: control flags - * @mark: marked value returned as output. - * - * scif_fence_mark() returns after marking the current set of all uncompleted - * RMAs initiated through the endpoint epd or the current set of all - * uncompleted RMAs initiated through the peer of endpoint epd. 
The RMAs are - * marked with a value returned at mark. The application may subsequently call - * scif_fence_wait(), passing the value returned at mark, to await completion - * of all RMAs so marked. - * - * The flags argument has exactly one of the following values. - * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint - * epd are marked - * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer - * of endpoint epd are marked - * - * Return: - * Upon successful completion, scif_fence_mark() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENOMEM - Insufficient kernel memory was available - */ -int scif_fence_mark(scif_epd_t epd, int flags, int *mark); - -/** - * scif_fence_wait() - Wait for completion of marked RMAs - * @epd: endpoint descriptor - * @mark: mark request - * - * scif_fence_wait() returns after all RMAs marked with mark have completed. - * The value passed in mark must have been obtained in a previous call to - * scif_fence_mark(). - * - * Return: - * Upon successful completion, scif_fence_wait() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. 
- * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENOMEM - Insufficient kernel memory was available - */ -int scif_fence_wait(scif_epd_t epd, int mark); - -/** - * scif_fence_signal() - Request a memory update on completion of RMAs - * @epd: endpoint descriptor - * @loff: local offset - * @lval: local value to write to loffset - * @roff: remote offset - * @rval: remote value to write to roffset - * @flags: flags - * - * scif_fence_signal() returns after marking the current set of all uncompleted - * RMAs initiated through the endpoint epd or marking the current set of all - * uncompleted RMAs initiated through the peer of endpoint epd. - * - * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the - * marked set, lval is written to memory at the address corresponding to offset - * loff in the local registered address space of epd. loff must be within a - * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion - * of the RMAs in the marked set, rval is written to memory at the address - * corresponding to offset roff in the remote registered address space of epd. - * roff must be within a remote registered window of the peer of epd. Note - * that any specified offset must be DWORD (4 byte / 32 bit) aligned. - * - * The flags argument is formed by OR'ing together the following. - * Exactly one of the following values. - * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint - * epd are marked - * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer - * of endpoint epd are marked - * One or more of the following values. 
- * SCIF_SIGNAL_LOCAL - On completion of the marked set of RMAs, write lval to - * memory at the address corresponding to offset loff in the local - * registered address space of epd. - * SCIF_SIGNAL_REMOTE - On completion of the marked set of RMAs, write rval to - * memory at the address corresponding to offset roff in the remote - * registered address space of epd. - * - * Return: - * Upon successful completion, scif_fence_signal() returns 0; otherwise in - * user mode -1 is returned and errno is set to indicate the error; in kernel - * mode the negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - flags is invalid, or loff or roff are not DWORD aligned - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - loff is invalid for the registered address of epd, or roff is invalid - * for the registered address space, of the peer of epd - */ -int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, off_t roff, - u64 rval, int flags); - -/** - * scif_get_node_ids() - Return information about online nodes - * @nodes: array in which to return online node IDs - * @len: number of entries in the nodes array - * @self: address to place the node ID of the local node - * - * scif_get_node_ids() fills in the nodes array with up to len node IDs of the - * nodes in the SCIF network. If there is not enough space in nodes, as - * indicated by the len parameter, only len node IDs are returned in nodes. The - * return value of scif_get_node_ids() is the total number of nodes currently in - * the SCIF network. By checking the return value against the len parameter, - * the user may determine if enough space for nodes was allocated. - * - * The node ID of the local node is returned at self. 
- * - * Return: - * Upon successful completion, scif_get_node_ids() returns the actual number of - * online nodes in the SCIF network including 'self'; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode no - * errors are returned. - */ -int scif_get_node_ids(u16 *nodes, int len, u16 *self); - -/** - * scif_pin_pages() - Pin a set of pages - * @addr: Virtual address of range to pin - * @len: Length of range to pin - * @prot_flags: Page protection flags - * @map_flags: Page classification flags - * @pinned_pages: Handle to pinned pages - * - * scif_pin_pages() pins (locks in physical memory) the physical pages which - * back the range of virtual address pages starting at addr and continuing for - * len bytes. addr and len are constrained to be multiples of the page size. A - * successful scif_pin_pages() call returns a handle to pinned_pages which may - * be used in subsequent calls to scif_register_pinned_pages(). - * - * The pages will remain pinned as long as there is a reference against the - * scif_pinned_pages_t value returned by scif_pin_pages() and until - * scif_unpin_pages() is called, passing the scif_pinned_pages_t value. A - * reference is added to a scif_pinned_pages_t value each time a window is - * created by calling scif_register_pinned_pages() and passing the - * scif_pinned_pages_t value. A reference is removed from a - * scif_pinned_pages_t value each time such a window is deleted. - * - * Subsequent operations which change the memory pages to which virtual - * addresses are mapped (such as mmap(), munmap()) have no effect on the - * scif_pinned_pages_t value or windows created against it. - * - * If the process will fork(), it is recommended that the registered - * virtual address range be marked with MADV_DONTFORK. Doing so will prevent - * problems due to copy-on-write semantics. - * - * The prot_flags argument is formed by OR'ing together one or more of the - * following values. 
- * SCIF_PROT_READ - allow read operations against the pages - * SCIF_PROT_WRITE - allow write operations against the pages - * The map_flags argument can be set as SCIF_MAP_KERNEL to interpret addr as a - * kernel space address. By default, addr is interpreted as a user space - * address. - * - * Return: - * Upon successful completion, scif_pin_pages() returns 0; otherwise the - * negative of one of the following errors is returned. - * - * Errors: - * EINVAL - prot_flags is invalid, map_flags is invalid, or offset is negative - * ENOMEM - Not enough space - */ -int scif_pin_pages(void *addr, size_t len, int prot_flags, int map_flags, - scif_pinned_pages_t *pinned_pages); - -/** - * scif_unpin_pages() - Unpin a set of pages - * @pinned_pages: Handle to pinned pages to be unpinned - * - * scif_unpin_pages() prevents scif_register_pinned_pages() from registering new - * windows against pinned_pages. The physical pages represented by pinned_pages - * will remain pinned until all windows previously registered against - * pinned_pages are deleted (the window is scif_unregister()'d and all - * references to the window are removed (see scif_unregister())). - * - * pinned_pages must have been obtained from a previous call to scif_pin_pages(). - * After calling scif_unpin_pages(), it is an error to pass pinned_pages to - * scif_register_pinned_pages(). - * - * Return: - * Upon successful completion, scif_unpin_pages() returns 0; otherwise the - * negative of one of the following errors is returned. - * - * Errors: - * EINVAL - pinned_pages is not valid - */ -int scif_unpin_pages(scif_pinned_pages_t pinned_pages); - -/** - * scif_register_pinned_pages() - Mark a memory region for remote access.
- * @epd: endpoint descriptor - * @pinned_pages: Handle to pinned pages - * @offset: Registered address space offset - * @map_flags: Flags which control where pages are mapped - * - * The scif_register_pinned_pages() function opens a window, a range of whole - * pages of the registered address space of the endpoint epd, starting at - * offset po. The value of po, further described below, is a function of the - * parameters offset and pinned_pages, and the value of map_flags. Each page of - * the window represents a corresponding physical memory page of the range - * represented by pinned_pages; the length of the window is the same as the - * length of range represented by pinned_pages. A successful - * scif_register_pinned_pages() call returns po as the return value. - * - * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset - * exactly, and offset is constrained to be a multiple of the page size. The - * mapping established by scif_register_pinned_pages() will not replace any - * existing registration; an error is returned if any page of the new window - * would intersect an existing window. - * - * When SCIF_MAP_FIXED is not set, the implementation uses offset in an - * implementation-defined manner to arrive at po. The po so chosen will be an - * area of the registered address space that the implementation deems suitable - * for a mapping of the required size. An offset value of 0 is interpreted as - * granting the implementation complete freedom in selecting po, subject to - * constraints described below. A non-zero value of offset is taken to be a - * suggestion of an offset near which the mapping should be placed. When the - * implementation selects a value for po, it does not replace any extant - * window. In all cases, po will be a multiple of the page size. 
- * - * The physical pages which are so represented by a window are available for - * access in calls to scif_get_pages(), scif_readfrom(), scif_writeto(), - * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the - * physical pages represented by the window will not be reused by the memory - * subsystem for any other purpose. Note that the same physical page may be - * represented by multiple windows. - * - * Windows created by scif_register_pinned_pages() are unregistered by - * scif_unregister(). - * - * The map_flags argument can be set to SCIF_MAP_FIXED which interprets a - * fixed offset. - * - * Return: - * Upon successful completion, scif_register_pinned_pages() returns the offset - * at which the mapping was placed (po); otherwise the negative of one of the - * following errors is returned. - * - * Errors: - * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags and pages in the new window - * would intersect an existing window - * EAGAIN - The mapping could not be performed due to lack of resources - * ECONNRESET - Connection reset by peer - * EINVAL - map_flags is invalid, or SCIF_MAP_FIXED is set in map_flags, and - * offset is not a multiple of the page size, or offset is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOTCONN - The endpoint is not connected - */ -off_t scif_register_pinned_pages(scif_epd_t epd, - scif_pinned_pages_t pinned_pages, - off_t offset, int map_flags); - -/** - * scif_get_pages() - Add references to remote registered pages - * @epd: endpoint descriptor - * @offset: remote registered offset - * @len: length of range of pages - * @pages: returned scif_range structure - * - * scif_get_pages() returns the addresses of the physical pages represented by - * those pages of the registered address space of the peer of epd, starting at - * offset and continuing for len bytes. 
offset and len are constrained to be - * multiples of the page size. - * - * All of the pages in the specified range [offset, offset + len - 1] must be - * within a single window of the registered address space of the peer of epd. - * - * The addresses are returned as a virtually contiguous array pointed to by the - * phys_addr component of the scif_range structure whose address is returned in - * pages. The nr_pages component of scif_range is the length of the array. The - * prot_flags component of scif_range holds the protection flag value passed - * when the pages were registered. - * - * Each physical page whose address is returned by scif_get_pages() remains - * available and will not be released for reuse until the scif_range structure - * is returned in a call to scif_put_pages(). The scif_range structure returned - * by scif_get_pages() must be unmodified. - * - * It is an error to call scif_close() on an endpoint on which a scif_range - * structure of that endpoint has not been returned to scif_put_pages(). - * - * Return: - * Upon successful completion, scif_get_pages() returns 0; otherwise the - * negative of one of the following errors is returned. - * Errors: - * ECONNRESET - Connection reset by peer. - * EINVAL - offset is not a multiple of the page size, or offset is negative, or - * len is not a multiple of the page size - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid - * for the registered address space of the peer epd - */ -int scif_get_pages(scif_epd_t epd, off_t offset, size_t len, - struct scif_range **pages); - -/** - * scif_put_pages() - Remove references from remote registered pages - * @pages: pages to be returned - * - * scif_put_pages() releases a scif_range structure previously obtained by - * calling scif_get_pages(). 
The physical pages represented by pages may - * be reused when the window which represented those pages is unregistered. - * Therefore, those pages must not be accessed after calling scif_put_pages(). - * - * Return: - * Upon successful completion, scif_put_pages() returns 0; otherwise the - * negative of one of the following errors is returned. - * Errors: - * EINVAL - pages does not point to a valid scif_range structure, or - * the scif_range structure pointed to by pages was already returned - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - */ -int scif_put_pages(struct scif_range *pages); - -/** - * scif_poll() - Wait for some event on an endpoint - * @epds: Array of endpoint descriptors - * @nepds: Length of epds - * @timeout: Upper limit on time for which scif_poll() will block - * - * scif_poll() waits for one of a set of endpoints to become ready to perform - * an I/O operation. - * - * The epds argument specifies the endpoint descriptors to be examined and the - * events of interest for each endpoint descriptor. epds is a pointer to an - * array with one member for each open endpoint descriptor of interest. - * - * The number of items in the epds array is specified in nepds. The epd field - * of scif_pollepd is an endpoint descriptor of an open endpoint. The field - * events is a bitmask specifying the events which the application is - * interested in. The field revents is an output parameter, filled by the - * kernel with the events that actually occurred. The bits returned in revents - * can include any of those specified in events, or one of the values EPOLLERR, - * EPOLLHUP, or EPOLLNVAL. (These three bits are meaningless in the events - * field, and will be set in the revents field whenever the corresponding - * condition is true.) 
- * - * If none of the events requested (and no error) has occurred for any of the - * endpoint descriptors, then scif_poll() blocks until one of the events occurs. - * - * The timeout argument specifies an upper limit on the time for which - * scif_poll() will block, in milliseconds. Specifying a negative value in - * timeout means an infinite timeout. - * - * The following bits may be set in events and returned in revents. - * EPOLLIN - Data may be received without blocking. For a connected - * endpoint, this means that scif_recv() may be called without blocking. For a - * listening endpoint, this means that scif_accept() may be called without - * blocking. - * EPOLLOUT - Data may be sent without blocking. For a connected endpoint, this - * means that scif_send() may be called without blocking. EPOLLOUT may also be - * used to block waiting for a non-blocking connect to complete. This bit value - * has no meaning for a listening endpoint and is ignored if specified. - * - * The following bits are only returned in revents, and are ignored if set in - * events. - * EPOLLERR - An error occurred on the endpoint - * EPOLLHUP - The connection to the peer endpoint was disconnected - * EPOLLNVAL - The specified endpoint descriptor is invalid. - * - * Return: - * Upon successful completion, scif_poll() returns a non-negative value. A - * positive value indicates the total number of endpoint descriptors that have - * been selected (that is, endpoint descriptors for which the revents member is - * non-zero). A value of 0 indicates that the call timed out and no endpoint - * descriptors have been selected. Otherwise in user mode -1 is returned and - * errno is set to indicate the error; in kernel mode the negative of one of - * the following errors is returned. 
- * - * Errors: - * EINTR - A signal occurred before any requested event - * EINVAL - The nepds argument is greater than {OPEN_MAX} - * ENOMEM - There was no space to allocate file descriptor tables - */ -int scif_poll(struct scif_pollepd *epds, unsigned int nepds, long timeout); - -/** - * scif_client_register() - Register a SCIF client - * @client: client to be registered - * - * scif_client_register() registers a SCIF client. The probe() method - * of the client is called when SCIF peer devices come online and the - * remove() method is called when the peer devices disappear. - * - * Return: - * Upon successful completion, scif_client_register() returns a non-negative - * value. Otherwise the return value is the same as subsys_interface_register() - * in the kernel. - */ -int scif_client_register(struct scif_client *client); - -/** - * scif_client_unregister() - Unregister a SCIF client - * @client: client to be unregistered - * - * scif_client_unregister() unregisters a SCIF client. - * - * Return: - * None - */ -void scif_client_unregister(struct scif_client *client); - -#endif /* __SCIF_H__ */ -- cgit v1.2.3 From a62f68f5ca53ab61cba2f0a410d0add7a6d54a52 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Oct 2020 17:35:46 +0200 Subject: cpufreq: Introduce cpufreq_driver_test_flags() Add a helper function to test the flags of the cpufreq driver in use against a given flags mask. In particular, this will be needed to test the CPUFREQ_NEED_UPDATE_LIMITS cpufreq driver flag in the schedutil governor. Signed-off-by: Rafael J. 
Wysocki --- include/linux/cpufreq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 038ed83aab41..1eaa04f1bae6 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -433,6 +433,7 @@ struct cpufreq_driver { int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); +bool cpufreq_driver_test_flags(u16 flags); const char *cpufreq_get_current_driver(void); void *cpufreq_get_driver_data(void); -- cgit v1.2.3 From 4169e889e5889405d54cec27d6e9f7f0ce3c7096 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 2 Sep 2020 23:25:55 -0500 Subject: include: jhash/signal: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, explicitly add break statements instead of letting the code fall through to the next case. This patch adds four break statements that, together, fix almost 40,000 warnings when building Linux 5.10-rc1 with Clang 12.0.0 and this[1] change reverted. Notice that in order to enable -Wimplicit-fallthrough for Clang, such change[1] is meant to be reverted at some point. So, this patch helps to move in that direction. Something important to mention is that there is currently a discrepancy between GCC and Clang when dealing with switch fall-through to empty case statements or to cases that only contain a break/continue/return statement[2][3][4]. Now that the -Wimplicit-fallthrough option has been globally enabled[5], any compiler should really warn on missing either a fallthrough annotation or any of the other case-terminating statements (break/continue/return/ goto) when falling through to the next case statement. Making exceptions to this introduces variation in case handling which may continue to lead to bugs, misunderstandings, and a general lack of robustness. 
The point of enabling options like -Wimplicit-fallthrough is to prevent human error and aid developers in spotting bugs before their code is even built/ submitted/committed, therefore eliminating classes of bugs. So, in order to really accomplish this, we should, and can, move in the direction of addressing any error-prone scenarios and get rid of the unintentional fallthrough bug-class in the kernel, entirely, even if there is some minor redundancy. Better to have explicit case-ending statements than continue to have exceptions where one must guess as to the right result. The compiler will eliminate any actual redundancy. [1] commit e2079e93f562c ("kbuild: Do not enable -Wimplicit-fallthrough for clang for now") [2] https://github.com/ClangBuiltLinux/linux/issues/636 [3] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91432 [4] https://godbolt.org/z/xgkvIh [5] commit a035d552a93b ("Makefile: Globally enable fall-through warning") Co-developed-by: Kees Cook Signed-off-by: Kees Cook Signed-off-by: Gustavo A. R. 
Silva --- include/linux/jhash.h | 2 ++ include/linux/signal.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jhash.h b/include/linux/jhash.h index cfb62e9f37be..ab7f8c152b89 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -99,6 +99,7 @@ static inline u32 jhash(const void *key, u32 length, u32 initval) case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); + break; case 0: /* Nothing left to add */ break; } @@ -136,6 +137,7 @@ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) case 2: b += k[1]; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); + break; case 0: /* Nothing left to add */ break; } diff --git a/include/linux/signal.h b/include/linux/signal.h index 7bbc0e9cf084..b256f9c65661 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -238,6 +238,7 @@ static inline void siginitset(sigset_t *set, unsigned long mask) memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = 0; + break; case 1: ; } } @@ -250,6 +251,7 @@ static inline void siginitsetinv(sigset_t *set, unsigned long mask) memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = -1; + break; case 1: ; } } -- cgit v1.2.3 From a4147d855f50a676ebe61833a681f7c71945f343 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 31 Aug 2020 10:18:04 -0500 Subject: dmaengine: ti-cppi5: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. 
[1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. Silva --- include/linux/dma/ti-cppi5.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma/ti-cppi5.h b/include/linux/dma/ti-cppi5.h index 5896441ee604..efa2f0309f00 100644 --- a/include/linux/dma/ti-cppi5.h +++ b/include/linux/dma/ti-cppi5.h @@ -47,7 +47,7 @@ struct cppi5_host_desc_t { u32 buf_info1; u32 org_buf_len; u64 org_buf_ptr; - u32 epib[0]; + u32 epib[]; } __packed; #define CPPI5_DESC_MIN_ALIGN (16U) @@ -139,7 +139,7 @@ struct cppi5_desc_epib_t { */ struct cppi5_monolithic_desc_t { struct cppi5_desc_hdr_t hdr; - u32 epib[0]; + u32 epib[]; }; #define CPPI5_INFO2_MDESC_DATA_OFFSET_SHIFT (18U) -- cgit v1.2.3 From 277ffd6c1ec0aa60856a03e18455fcca7d2a1186 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 31 Aug 2020 10:19:18 -0500 Subject: mailbox: zynqmp-ipi-message: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. 
Silva --- include/linux/mailbox/zynqmp-ipi-message.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mailbox/zynqmp-ipi-message.h b/include/linux/mailbox/zynqmp-ipi-message.h index 9542b41eacfd..35ce84c8ca02 100644 --- a/include/linux/mailbox/zynqmp-ipi-message.h +++ b/include/linux/mailbox/zynqmp-ipi-message.h @@ -14,7 +14,7 @@ */ struct zynqmp_ipi_message { size_t len; - u8 data[0]; + u8 data[]; }; #endif /* _LINUX_ZYNQMP_IPI_MESSAGE_H_ */ -- cgit v1.2.3 From 883541051567a62add043a9f4ca5a31f2970bffd Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 31 Aug 2020 10:21:14 -0500 Subject: platform/chrome: cros_ec_commands: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. Silva --- include/linux/platform_data/cros_ec_commands.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index 1fcfe9e63cb9..a3a9a878415f 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -1419,7 +1419,7 @@ struct ec_response_flash_info_2 { uint16_t num_banks_total; /* Number of banks described in banks array. 
*/ uint16_t num_banks_desc; - struct ec_flash_bank banks[0]; + struct ec_flash_bank banks[]; } __ec_align4; /* @@ -2420,12 +2420,12 @@ struct ec_response_motion_sense_fifo_info { /* Total amount of vector lost */ uint16_t total_lost; /* Lost events since the last fifo_info, per sensors */ - uint16_t lost[0]; + uint16_t lost[]; } __ec_todo_packed; struct ec_response_motion_sense_fifo_data { uint32_t number_data; - struct ec_response_motion_sensor_data data[0]; + struct ec_response_motion_sensor_data data[]; } __ec_todo_packed; /* List supported activity recognition */ @@ -3093,7 +3093,7 @@ struct ec_response_tmp006_get_calibration_v1 { uint8_t algorithm; uint8_t num_params; uint8_t reserved[2]; - float val[0]; + float val[]; } __ec_align4; struct ec_params_tmp006_set_calibration_v1 { @@ -3101,7 +3101,7 @@ struct ec_params_tmp006_set_calibration_v1 { uint8_t algorithm; uint8_t num_params; uint8_t reserved; - float val[0]; + float val[]; } __ec_align4; @@ -5076,7 +5076,7 @@ struct ec_response_pd_log { uint8_t type; /* event type : see PD_EVENT_xx below */ uint8_t size_port; /* [7:5] port number [4:0] payload size in bytes */ uint16_t data; /* type-defined data payload */ - uint8_t payload[0]; /* optional additional data payload: 0..16 bytes */ + uint8_t payload[]; /* optional additional data payload: 0..16 bytes */ } __ec_align4; /* The timestamp is the microsecond counter shifted to get about a ms. */ @@ -5789,7 +5789,7 @@ struct ec_response_fp_encryption_status { struct ec_response_tp_frame_info { uint32_t n_frames; - uint32_t frame_sizes[0]; + uint32_t frame_sizes[]; } __ec_align4; /* Create a snapshot of current frame readings */ -- cgit v1.2.3 From 120088832042e6dc9866160ff267f8c347bf53e6 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Mon, 31 Aug 2020 10:21:55 -0500 Subject: platform/chrome: cros_ec_proto: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. Silva --- include/linux/platform_data/cros_ec_proto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 4a415ae851ef..02599687770c 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -69,7 +69,7 @@ struct cros_ec_command { uint32_t outsize; uint32_t insize; uint32_t result; - uint8_t data[0]; + uint8_t data[]; }; /** -- cgit v1.2.3 From 5e01fdff04b7f7c3b8d456c11c8a9f978b4ddf65 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 31 Aug 2020 08:25:42 -0500 Subject: fs: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. 
[1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. Silva --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0bd126418bb6..21cc971fd960 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3285,7 +3285,7 @@ static inline ino_t parent_ino(struct dentry *dentry) */ struct simple_transaction_argresp { ssize_t size; - char data[0]; + char data[]; }; #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) -- cgit v1.2.3 From 080b6f40763565f65ebb9540219c71ce885cf568 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 28 Oct 2020 18:15:05 +0100 Subject: bpf: Don't rely on GCC __attribute__((optimize)) to disable GCSE Commit 3193c0836 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") introduced a __no_fgcse macro that expands to a function scope __attribute__((optimize("-fno-gcse"))), to disable a GCC specific optimization that was causing trouble on x86 builds, and was not expected to have any positive effect in the first place. However, as the GCC manual documents, __attribute__((optimize)) is not for production use, and results in all other optimization options to be forgotten for the function in question. This can cause all kinds of trouble, but in one particular reported case, it causes -fno-asynchronous-unwind-tables to be disregarded, resulting in .eh_frame info to be emitted for the function. This reverts commit 3193c0836, and instead, it disables the -fgcse optimization for the entire source file, but only when building for X86 using GCC with CONFIG_BPF_JIT_ALWAYS_ON disabled. Note that the original commit states that CONFIG_RETPOLINE=n triggers the issue, whereas CONFIG_RETPOLINE=y performs better without the optimization, so it is kept disabled in both cases. 
Fixes: 3193c0836f20 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") Signed-off-by: Ard Biesheuvel Signed-off-by: Alexei Starovoitov Tested-by: Geert Uytterhoeven Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/lkml/CAMuHMdUg0WJHEcq6to0-eODpXPOywLot6UD2=GFHpzoj_hCoBQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20201028171506.15682-2-ardb@kernel.org --- include/linux/compiler-gcc.h | 2 -- include/linux/compiler_types.h | 4 ---- 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d1e3c6896b71..5deb37024574 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -175,5 +175,3 @@ #else #define __diag_GCC_8(s) #endif - -#define __no_fgcse __attribute__((optimize("-fno-gcse"))) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6e390d58a9f8..ac3fa37a84f9 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -247,10 +247,6 @@ struct ftrace_likely_data { #define asm_inline asm #endif -#ifndef __no_fgcse -# define __no_fgcse -#endif - /* Are two types/vars the same type (ignoring qualifiers)? */ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) -- cgit v1.2.3 From 0d519cbf38eed4f895aed197d4b135fa7f60f7c2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 23 Oct 2020 15:10:37 +0200 Subject: debugfs: remove return value of debugfs_create_devm_seqfile() No one checks the return value of debugfs_create_devm_seqfile(), as it's not needed, so make the return value void, so that no one tries to do so in the future. 
Link: https://lore.kernel.org/r/20201023131037.2500765-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 851dd1f9a8a5..d6c4cc9ecc77 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -144,10 +144,9 @@ void debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, struct debugfs_u32_array *array); -struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, - struct dentry *parent, - int (*read_fn)(struct seq_file *s, - void *data)); +void debugfs_create_devm_seqfile(struct device *dev, const char *name, + struct dentry *parent, + int (*read_fn)(struct seq_file *s, void *data)); bool debugfs_initialized(void); @@ -327,13 +326,12 @@ static inline void debugfs_create_u32_array(const char *name, umode_t mode, { } -static inline struct dentry *debugfs_create_devm_seqfile(struct device *dev, - const char *name, - struct dentry *parent, - int (*read_fn)(struct seq_file *s, - void *data)) +static inline void debugfs_create_devm_seqfile(struct device *dev, + const char *name, + struct dentry *parent, + int (*read_fn)(struct seq_file *s, + void *data)) { - return ERR_PTR(-ENODEV); } static inline ssize_t debugfs_read_file_bool(struct file *file, -- cgit v1.2.3 From 46d6c5ae953cc0be38efd0e469284df7c4328cf8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 29 Oct 2020 03:56:06 +0100 Subject: netfilter: use actual socket sk rather than skb sk when routing harder If netfilter changes the packet mark when mangling, the packet is rerouted using the route_me_harder set of functions. 
Prior to this commit, there's one big difference between route_me_harder and the ordinary initial routing functions, described in the comment above __ip_queue_xmit(): /* Note: skb->sk can be different from sk, in case of tunnels */ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, That function goes on to correctly make use of sk->sk_bound_dev_if, rather than skb->sk->sk_bound_dev_if. And indeed the comment is true: a tunnel will receive a packet in ndo_start_xmit with an initial skb->sk. It will make some transformations to that packet, and then it will send the encapsulated packet out of a *new* socket. That new socket will basically always have a different sk_bound_dev_if (otherwise there'd be a routing loop). So for the purposes of routing the encapsulated packet, the routing information as it pertains to the socket should come from that socket's sk, rather than the packet's original skb->sk. For that reason __ip_queue_xmit() and related functions all do the right thing. One might argue that all tunnels should just call skb_orphan(skb) before transmitting the encapsulated packet into the new socket. But tunnels do *not* do this -- and this is wisely avoided in skb_scrub_packet() too -- because features like TSQ rely on skb->destructor() being called when that buffer space is truly available again. Calling skb_orphan(skb) too early would result in buffers filling up unnecessarily and accounting info being all wrong. Instead, additional routing must take into account the new sk, just as __ip_queue_xmit() notes. So, this commit addresses the problem by fishing the correct sk out of state->sk -- it's already set properly in the call to nf_hook() in __ip_local_out(), which receives the sk as part of its normal functionality. So we make sure to plumb state->sk through the various route_me_harder functions, and then make correct use of it following the example of __ip_queue_xmit(). 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jason A. Donenfeld Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv4.h | 2 +- include/linux/netfilter_ipv6.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index 082e2c41b7ff..5b70ca868bb1 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -16,7 +16,7 @@ struct ip_rt_info { u_int32_t mark; }; -int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned addr_type); +int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned addr_type); struct nf_queue_entry; diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 9b67394471e1..48314ade1506 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -42,7 +42,7 @@ struct nf_ipv6_ops { #if IS_MODULE(CONFIG_IPV6) int (*chk_addr)(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); - int (*route_me_harder)(struct net *net, struct sk_buff *skb); + int (*route_me_harder)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*dev_get_saddr)(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); @@ -143,9 +143,9 @@ static inline int nf_br_ip6_fragment(struct net *net, struct sock *sk, #endif } -int ip6_route_me_harder(struct net *net, struct sk_buff *skb); +int ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb); -static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb) +static inline int nf_ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb) { #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); @@ -153,9 +153,9 @@ static inline int nf_ip6_route_me_harder(struct net *net, 
struct sk_buff *skb) if (!v6_ops) return -EHOSTUNREACH; - return v6_ops->route_me_harder(net, skb); + return v6_ops->route_me_harder(net, sk, skb); #elif IS_BUILTIN(CONFIG_IPV6) - return ip6_route_me_harder(net, skb); + return ip6_route_me_harder(net, sk, skb); #else return -EHOSTUNREACH; #endif -- cgit v1.2.3 From c0391b6ab810381df632677a1dcbbbbd63d05b6d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 29 Oct 2020 13:50:03 +0100 Subject: netfilter: nf_tables: missing validation from the abort path If userspace does not include the trailing end of batch message, then nfnetlink aborts the transaction. This allows to check that ruleset updates trigger no errors. After this patch, invoking this command from the prerouting chain: # nft -c add rule x y fib saddr . oif type local fails since oif is not supported there. This patch fixes the lack of rule validation from the abort/check path to catch configuration errors such as the one above. Fixes: a654de8fdc18 ("netfilter: nf_tables: fix chain dependency validation") Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 89016d08f6a2..f6267e2883f2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -24,6 +24,12 @@ struct nfnl_callback { const u_int16_t attr_count; /* number of nlattr's */ }; +enum nfnl_abort_action { + NFNL_ABORT_NONE = 0, + NFNL_ABORT_AUTOLOAD, + NFNL_ABORT_VALIDATE, +}; + struct nfnetlink_subsystem { const char *name; __u8 subsys_id; /* nfnetlink subsystem ID */ @@ -31,7 +37,8 @@ struct nfnetlink_subsystem { const struct nfnl_callback *cb; /* callback for individual types */ struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); - int (*abort)(struct net *net, struct sk_buff *skb, bool autoload); + int (*abort)(struct net *net, 
struct sk_buff *skb, + enum nfnl_abort_action action); void (*cleanup)(struct net *net); bool (*valid_genid)(struct net *net, u32 genid); }; -- cgit v1.2.3 From 290562075d4d9e85b7ff4104f9a634ffc3cccb69 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 27 Oct 2020 15:28:40 -0500 Subject: net/mlx5: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. Silva --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 651591a2965d..a092346c7b2d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5823,7 +5823,7 @@ struct mlx5_ifc_alloc_modify_header_context_in_bits { u8 reserved_at_68[0x10]; u8 num_of_actions[0x8]; - union mlx5_ifc_set_add_copy_action_in_auto_bits actions[0]; + union mlx5_ifc_set_add_copy_action_in_auto_bits actions[]; }; struct mlx5_ifc_dealloc_modify_header_context_out_bits { @@ -9761,7 +9761,7 @@ struct mlx5_ifc_mcda_reg_bits { u8 reserved_at_60[0x20]; - u8 data[0][0x20]; + u8 data[][0x20]; }; enum { -- cgit v1.2.3 From f51778db088b2407ec177f2f4da0f6290602aa3f Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 2 Nov 2020 12:43:27 +1100 Subject: swiotlb: using SIZE_MAX needs limits.h included After merging the drm-misc tree, linux-next build (arm multi_v7_defconfig) failed like this: In file included from 
drivers/gpu/drm/nouveau/nouveau_ttm.c:26: include/linux/swiotlb.h: In function 'swiotlb_max_mapping_size': include/linux/swiotlb.h:99:9: error: 'SIZE_MAX' undeclared (first use in this function) 99 | return SIZE_MAX; | ^~~~~~~~ include/linux/swiotlb.h:7:1: note: 'SIZE_MAX' is defined in header '<stdint.h>'; did you forget to '#include <stdint.h>'? 6 | #include <linux/init.h> +++ |+#include <stdint.h> 7 | #include <linux/types.h> include/linux/swiotlb.h:99:9: note: each undeclared identifier is reported only once for each function it appears in 99 | return SIZE_MAX; | ^~~~~~~~ Caused by commit abe420bfae52 ("swiotlb: Introduce swiotlb_max_mapping_size()") but only exposed by commit "drm/nouveu: fix swiotlb include" Fix it by including linux/limits.h as appropriate. Fixes: abe420bfae52 ("swiotlb: Introduce swiotlb_max_mapping_size()") Signed-off-by: Stephen Rothwell Link: https://lore.kernel.org/r/20201102124327.2f82b2a7@canb.auug.org.au Signed-off-by: Michael S. Tsirkin --- include/linux/swiotlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 046bb94bd4d6..fa5122c6711e 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -5,6 +5,7 @@ #include <linux/dma-direction.h> #include <linux/init.h> #include <linux/types.h> +#include <linux/limits.h> struct device; struct page; -- cgit v1.2.3 From fc0021aa340af65a0a37d77be39e22aa886a6132 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Oct 2020 08:33:09 +0200 Subject: swiotlb: remove the tbl_dma_addr argument to swiotlb_tbl_map_single The tbl_dma_addr argument is used to check the DMA boundary for the allocations, and thus needs to be a dma_addr_t. swiotlb-xen instead passed a physical address, which could lead to incorrect results for strange offsets. Fix this by removing the parameter entirely and hard code the DMA address for io_tlb_start instead.
Fixes: 91ffe4ad534a ("swiotlb-xen: introduce phys_to_dma/dma_to_phys translations") Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 513913ff7486..3bb72266a75a 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -45,13 +45,9 @@ enum dma_sync_target { SYNC_FOR_DEVICE = 1, }; -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t phys, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs); +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs); extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, -- cgit v1.2.3 From e0e398e204634db8fb71bd89cf2f6e3e5bd09b51 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 21 Oct 2020 21:12:15 +0200 Subject: PM: runtime: Drop runtime PM references to supplier on link removal While removing a device link, drop the supplier device's runtime PM usage counter as many times as needed to drop all of the runtime PM references to it from the consumer in addition to dropping the consumer's link count. Fixes: baa8809f6097 ("PM / runtime: Optimize the use of device links") Signed-off-by: Rafael J. 
Wysocki Cc: 5.1+ # 5.1+ Tested-by: Xiang Chen Reviewed-by: Greg Kroah-Hartman --- include/linux/pm_runtime.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 18b02dcc168e..eadc1fdebce6 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -58,7 +58,7 @@ extern void pm_runtime_clean_up_links(struct device *dev); extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); -extern void pm_runtime_drop_link(struct device *dev); +extern void pm_runtime_drop_link(struct device_link *link); /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. @@ -280,7 +280,7 @@ static inline void pm_runtime_clean_up_links(struct device *dev) {} static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} -static inline void pm_runtime_drop_link(struct device *dev) {} +static inline void pm_runtime_drop_link(struct device_link *link) {} #endif /* !CONFIG_PM */ -- cgit v1.2.3 From d6e36668598154820177bfd78c1621d8e6c580a2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 21 Oct 2020 21:13:10 +0200 Subject: PM: runtime: Drop pm_runtime_clean_up_links() After commit d12544fb2aa9 ("PM: runtime: Remove link state checks in rpm_get/put_supplier()") nothing prevents the consumer device's runtime PM from acquiring additional references to the supplier device after pm_runtime_clean_up_links() has run (or even while it is running), so calling this function from __device_release_driver() may be pointless (or even harmful). 
Moreover, it ignores stateless device links, so the runtime PM handling of managed and stateless device links is inconsistent because of it, so better get rid of it entirely. Fixes: d12544fb2aa9 ("PM: runtime: Remove link state checks in rpm_get/put_supplier()") Signed-off-by: Rafael J. Wysocki Cc: 5.1+ # 5.1+ Tested-by: Xiang Chen Reviewed-by: Greg Kroah-Hartman --- include/linux/pm_runtime.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index eadc1fdebce6..4b708f4e8eed 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -54,7 +54,6 @@ extern u64 pm_runtime_autosuspend_expiration(struct device *dev); extern void pm_runtime_update_max_time_suspended(struct device *dev, s64 delta_ns); extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); -extern void pm_runtime_clean_up_links(struct device *dev); extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); @@ -276,7 +275,6 @@ static inline u64 pm_runtime_autosuspend_expiration( struct device *dev) { return 0; } static inline void pm_runtime_set_memalloc_noio(struct device *dev, bool enable){} -static inline void pm_runtime_clean_up_links(struct device *dev) {} static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} -- cgit v1.2.3 From f8f6ae5d077a9bdaf5cbf2ac960a5d1a04b47482 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 1 Nov 2020 17:08:00 -0800 Subject: mm: always have io_remap_pfn_range() set pgprot_decrypted() The purpose of io_remap_pfn_range() is to map IO memory, such as a memory mapped IO exposed through a PCI BAR. IO devices do not understand encryption, so this memory must always be decrypted. 
Automatically call pgprot_decrypted() as part of the generic implementation. This fixes a bug where enabling AMD SME causes subsystems, such as RDMA, using io_remap_pfn_range() to expose BAR pages to user space to fail. The CPU will encrypt access to those BAR pages instead of passing unencrypted IO directly to the device. Places not mapping IO should use remap_pfn_range(). Fixes: aca20d546214 ("x86/mm: Add support to make use of Secure Memory Encryption") Signed-off-by: Jason Gunthorpe Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Tom Lendacky Cc: Thomas Gleixner Cc: Andrey Ryabinin Cc: Borislav Petkov Cc: Brijesh Singh Cc: Jonathan Corbet Cc: Dmitry Vyukov Cc: "Dave Young" Cc: Alexander Potapenko Cc: Konrad Rzeszutek Wilk Cc: Andy Lutomirski Cc: Larry Woodman Cc: Matt Fleming Cc: Ingo Molnar Cc: "Michael S. Tsirkin" Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Toshimitsu Kani Cc: Link: https://lkml.kernel.org/r/0-v1-025d64bdf6c4+e-amd_sme_fix_jgg@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++++ include/linux/pgtable.h | 4 ---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ef360fe70aaf..db6ae4d3fb4e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2759,6 +2759,15 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +#ifndef io_remap_pfn_range +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); +} +#endif + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 38c33eabea89..71125a4676c4 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1427,10 +1427,6 @@ typedef unsigned int pgtbl_mod_mask; #endif /* 
!__ASSEMBLY__ */ -#ifndef io_remap_pfn_range -#define io_remap_pfn_range remap_pfn_range -#endif - #ifndef has_transparent_hugepage #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define has_transparent_hugepage() 1 -- cgit v1.2.3 From 286228d382ba6320f04fa2e7c6fc8d4d92e428f4 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 18 Dec 2019 09:39:02 +0100 Subject: can: can_create_echo_skb(): fix echo skb generation: always use skb_clone() All user space generated SKBs are owned by a socket (unless injected into the kernel via AF_PACKET). If a socket is closed, all associated skbs will be cleaned up. This leads to a problem when a CAN driver calls can_put_echo_skb() on an unshared SKB. If the socket is closed prior to the TX complete handler, can_get_echo_skb() and the subsequent delivering of the echo SKB to all registered callbacks, an SKB with a refcount of 0 is delivered. To avoid the problem, in can_get_echo_skb() the original SKB is now always cloned, regardless of shared SKB or not. If the process exits it can now safely discard its SKBs, without disturbing the delivery of the echo SKB. The problem shows up in the j1939 stack, when it clones the incoming skb, which detects the already 0 refcount. We can easily reproduce this with the following example: testj1939 -B -r can0: & cansend can0 1823ff40#0123 WARNING: CPU: 0 PID: 293 at lib/refcount.c:25 refcount_warn_saturate+0x108/0x174 refcount_t: addition on 0; use-after-free.
Modules linked in: coda_vpu imx_vdoa videobuf2_vmalloc dw_hdmi_ahb_audio vcan CPU: 0 PID: 293 Comm: cansend Not tainted 5.5.0-rc6-00376-g9e20dcb7040d #1 Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) Backtrace: [] (dump_backtrace) from [] (show_stack+0x20/0x24) [] (show_stack) from [] (dump_stack+0x8c/0xa0) [] (dump_stack) from [] (__warn+0xe0/0x108) [] (__warn) from [] (warn_slowpath_fmt+0xa8/0xcc) [] (warn_slowpath_fmt) from [] (refcount_warn_saturate+0x108/0x174) [] (refcount_warn_saturate) from [] (j1939_can_recv+0x20c/0x210) [] (j1939_can_recv) from [] (can_rcv_filter+0xb4/0x268) [] (can_rcv_filter) from [] (can_receive+0xb0/0xe4) [] (can_receive) from [] (can_rcv+0x48/0x98) [] (can_rcv) from [] (__netif_receive_skb_one_core+0x64/0x88) [] (__netif_receive_skb_one_core) from [] (__netif_receive_skb+0x38/0x94) [] (__netif_receive_skb) from [] (netif_receive_skb_internal+0x64/0xf8) [] (netif_receive_skb_internal) from [] (netif_receive_skb+0x34/0x19c) [] (netif_receive_skb) from [] (can_rx_offload_napi_poll+0x58/0xb4) Fixes: 0ae89beb283a ("can: add destructor for self generated skbs") Signed-off-by: Oleksij Rempel Link: http://lore.kernel.org/r/20200124132656.22156-1-o.rempel@pengutronix.de Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 900b9f4e0605..fc61cf4eff1c 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -61,21 +61,17 @@ static inline void can_skb_set_owner(struct sk_buff *skb, struct sock *sk) */ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) { - if (skb_shared(skb)) { - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + struct sk_buff *nskb; - if (likely(nskb)) { - can_skb_set_owner(nskb, skb->sk); - consume_skb(skb); - return nskb; - } else { - kfree_skb(skb); - return NULL; - } + nskb 
= skb_clone(skb, GFP_ATOMIC); + if (unlikely(!nskb)) { + kfree_skb(skb); + return NULL; } - /* we can assume to have an unshared skb with proper owner */ - return skb; + can_skb_set_owner(nskb, skb->sk); + consume_skb(skb); + return nskb; } #endif /* !_CAN_SKB_H */ -- cgit v1.2.3 From 763e4cdc0f6d5cea45c896fef67f7be4bdefcca7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 29 Oct 2020 14:30:48 -0700 Subject: iomap: support partial page discard on writeback block mapping failure iomap writeback mapping failure only calls into ->discard_page() if the current page has not been added to the ioend. Accordingly, the XFS callback assumes a full page discard and invalidation. This is problematic for sub-page block size filesystems where some portion of a page might have been mapped successfully before a failure to map a delalloc block occurs. ->discard_page() is not called in that error scenario and the bio is explicitly failed by iomap via the error return from ->prepare_ioend(). As a result, the filesystem leaks delalloc blocks and corrupts the filesystem block counters. Since XFS is the only user of ->discard_page(), tweak the semantics to invoke the callback unconditionally on mapping errors and provide the file offset that failed to map. Update xfs_discard_page() to discard the corresponding portion of the file and pass the range along to iomap_invalidatepage(). The latter already properly handles both full and sub-page scenarios by not changing any iomap or page state on sub-page invalidations. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 172b3397a1a3..5bd3cac4df9c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -221,7 +221,7 @@ struct iomap_writeback_ops { * Optional, allows the file system to discard state on a page where * we failed to submit any I/O. */ - void (*discard_page)(struct page *page); + void (*discard_page)(struct page *page, loff_t fileoff); }; struct iomap_writepage_ctx { -- cgit v1.2.3 From fdaf083cdfb556a45c422c8998268baf1ab26829 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Oct 2020 09:37:30 -0600 Subject: io_uring: properly handle SQPOLL request cancelations Track if a given task io_uring context contains SQPOLL instances, so we can iterate those for cancelation (and request counts). This ensures that we properly wait on SQPOLL contexts, and find everything that needs canceling. Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 868364cea3b7..35b2d845704d 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -30,7 +30,8 @@ struct io_uring_task { struct percpu_counter inflight; struct io_identity __identity; struct io_identity *identity; - bool in_idle; + atomic_t in_idle; + bool sqpoll; }; #if defined(CONFIG_IO_URING) -- cgit v1.2.3 From d4d50710a8b46082224376ef119a4dbb75b25c56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Nov 2020 09:27:33 +0100 Subject: seq_file: add seq_read_iter iov_iter based variant for reading a seq_file. seq_read is reimplemented on top of the iter variant. 
Signed-off-by: Christoph Hellwig Tested-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- include/linux/seq_file.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 813614d4b71f..b83b3ae3c877 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -107,6 +107,7 @@ void seq_pad(struct seq_file *m, char c); char *mangle_path(char *s, const char *p, const char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); +ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter); loff_t seq_lseek(struct file *, loff_t, int); int seq_release(struct inode *, struct file *); int seq_write(struct seq_file *seq, const void *data, size_t len); -- cgit v1.2.3 From ede7dc7fa0af619afc08995776eadb9ff3b0a711 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:54 -0800 Subject: jbd2: rename j_maxlen to j_total_len and add jbd2_journal_max_txn_bufs The on-disk superblock field sb->s_maxlen represents the total size of the journal including the fast commit area and is no more the max number of blocks available for a transaction. The maximum number of blocks available to a transaction is reduced by the number of fast commit blocks. So, this patch renames j_maxlen to j_total_len to better represent its intent. Also, it adds a function to calculate max number of bufs available for a transaction. 
Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-6-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 1d5566af48ac..e0b6b53eae64 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -988,9 +988,9 @@ struct journal_s struct block_device *j_fs_dev; /** - * @j_maxlen: Total maximum capacity of the journal region on disk. + * @j_total_len: Total maximum capacity of the journal region on disk. */ - unsigned int j_maxlen; + unsigned int j_total_len; /** * @j_reserved_credits: @@ -1624,6 +1624,11 @@ int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); int jbd2_fc_release_bufs(journal_t *journal); +static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal) +{ + return (journal->j_total_len - journal->j_fc_wbufsize) / 4; +} + /* * is_journal_abort * -- cgit v1.2.3 From a1e5e465b31d6015fccb359d99053b39e5180466 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:55 -0800 Subject: ext4: clean up the JBD2 API that initializes fast commits This patch removes jbd2_fc_init() API and its related functions to simplify enabling fast commits. With this change, the number of fast commit blocks to use is solely determined by the JBD2 layer. So, we move the default value for minimum number of fast commit blocks from ext4/fast_commit.h to include/linux/jbd2.h. However, whether or not to use fast commits is determined by the file system. The file system just sets the fast commit feature using jbd2_journal_set_features(). JBD2 layer then determines how many blocks to use for fast commits (based on the value found in the JBD2 superblock). 
Note that the JBD2 feature flag of fast commits is just an indication that there are fast commit blocks present on disk. It doesn't tell the JBD2 layer about the intent of the file system, i.e. whether it wants to use fast commit or not. That's why we blindly clear the fast commit flag in journal_reset() after the recovery is done. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-7-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index e0b6b53eae64..b2caf7bbd8e5 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -68,6 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 +#define JBD2_MIN_FC_BLOCKS 256 #ifdef __KERNEL__ @@ -1614,7 +1615,6 @@ extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ -int jbd2_fc_init(journal_t *journal, int num_fc_blks); int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid); -- cgit v1.2.3 From c460e5edc85a063ec9cb60addff93d00ed378701 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:57 -0800 Subject: jbd2: don't use state lock during commit path Variables journal->j_fc_off, journal->j_fc_wbuf are accessed during the commit path. Since today we allow only one process to perform a fast commit, there is no need to take the state lock before accessing these variables. This patch removes these locks and adds comments to describe this.
Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-9-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b2caf7bbd8e5..5f0ef6380b0c 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -945,8 +945,9 @@ struct journal_s /** * @j_fc_off: * - * Number of fast commit blocks currently allocated. - * [j_state_lock]. + * Number of fast commit blocks currently allocated. Accessed only + * during fast commit. Currently only process can do fast commit, so + * this field is not protected by any lock. */ unsigned long j_fc_off; @@ -1109,8 +1110,9 @@ struct journal_s struct buffer_head **j_wbuf; /** - * @j_fc_wbuf: Array of fast commit bhs for - * jbd2_journal_commit_transaction. + * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only + * during a fast commit. Currently only process can do fast commit, so + * this field is not protected by any lock. */ struct buffer_head **j_fc_wbuf; -- cgit v1.2.3 From 0bce577bf9cae13ae32d391432d0030e3f67fc1d Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:58 -0800 Subject: jbd2: don't pass tid to jbd2_fc_end_commit_fallback() In jbd2_fc_end_commit_fallback(), we know which tid to commit. There's no need for caller to pass it. 
Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-10-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5f0ef6380b0c..1c49fd62ff2e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1619,7 +1619,7 @@ extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); -int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid); +int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); -- cgit v1.2.3 From 267fb27352b6fc9fdbad753127a239f75618ecbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 15:50:32 +0100 Subject: perf: Reduce stack usage of perf_output_begin() __perf_output_begin() has an on-stack struct perf_sample_data in the unlikely case it needs to generate a LOST record. However, every call to perf_output_begin() must already have a perf_sample_data on-stack. 
Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201030151954.985416146@infradead.org --- include/linux/perf_event.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0c19d279b97f..b775ae0a8c87 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1400,11 +1400,14 @@ perf_event_addr_filters(struct perf_event *event) extern void perf_event_addr_filters_sync(struct perf_event *event); extern int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, - unsigned int size); + struct perf_sample_data *data, + struct perf_event *event, + unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); -- cgit v1.2.3 From 76a4efa80900fc40e0fdf243b42aec9fb8c35d24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 12:14:21 +0100 Subject: perf/arch: Remove perf_sample_data::regs_user_copy struct perf_sample_data lives on-stack, we should be careful about its size. Furthermore, the pt_regs copy in there is only because x86_64 is a trainwreck, solve it differently.
Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Steven Rostedt Link: https://lkml.kernel.org/r/20201030151955.258178461@infradead.org --- include/linux/perf_event.h | 6 ------ include/linux/perf_regs.h | 6 ++---- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b775ae0a8c87..96450f6fb1de 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1022,13 +1022,7 @@ struct perf_sample_data { struct perf_callchain_entry *callchain; u64 aux_size; - /* - * regs_user may point to task_pt_regs or to regs_user_copy, depending - * on arch details. - */ struct perf_regs regs_user; - struct pt_regs regs_user_copy; - struct perf_regs regs_intr; u64 stack_user_size; diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h index 2d12e97d5e7b..f632c5725f16 100644 --- a/include/linux/perf_regs.h +++ b/include/linux/perf_regs.h @@ -20,8 +20,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx); int perf_reg_validate(u64 mask); u64 perf_reg_abi(struct task_struct *task); void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy); + struct pt_regs *regs); #else #define PERF_REG_EXTENDED_MASK 0 @@ -42,8 +41,7 @@ static inline u64 perf_reg_abi(struct task_struct *task) } static inline void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); -- cgit v1.2.3 From 9a2a9ebc0a758d887ee06e067e9f7f0b36ff7574 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 10 Nov 2020 18:25:57 +0100 Subject: cpufreq: Introduce governor flags A new cpufreq governor flag will be added subsequently, so replace the bool dynamic_switching field in struct cpufreq_governor with a flags field and introduce CPUFREQ_GOV_DYNAMIC_SWITCHING to set for the "dynamic switching" governors instead of it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- include/linux/cpufreq.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1eaa04f1bae6..9bdfcf3c4748 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -570,12 +570,17 @@ struct cpufreq_governor { char *buf); int (*store_setspeed) (struct cpufreq_policy *policy, unsigned int freq); - /* For governors which change frequency dynamically by themselves */ - bool dynamic_switching; struct list_head governor_list; struct module *owner; + u8 flags; }; +/* Governor flags */ + +/* For governors which change frequency dynamically by themselves */ +#define CPUFREQ_GOV_DYNAMIC_SWITCHING BIT(0) + + /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); -- cgit v1.2.3 From 218f66870181bec7aaa6e3c72f346039c590c3c2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:26:10 +0100 Subject: cpufreq: Introduce CPUFREQ_GOV_STRICT_TARGET Introduce a new governor flag, CPUFREQ_GOV_STRICT_TARGET, for the governors that want the target frequency to be set exactly to the given value without leaving any room for adjustments on the hardware side and set this flag for the powersave and performance governors. Signed-off-by: Rafael J.
Wysocki Acked-by: Viresh Kumar --- include/linux/cpufreq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9bdfcf3c4748..6eb9a3b8ec7b 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -580,6 +580,9 @@ struct cpufreq_governor { /* For governors which change frequency dynamically by themselves */ #define CPUFREQ_GOV_DYNAMIC_SWITCHING BIT(0) +/* For governors wanting the target frequency to be set exactly */ +#define CPUFREQ_GOV_STRICT_TARGET BIT(1) + /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, -- cgit v1.2.3 From ea9364bbadf11f0c55802cf11387d74f524cee84 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:26:37 +0100 Subject: cpufreq: Add strict_target to struct cpufreq_policy Add a new field to be set when the CPUFREQ_GOV_STRICT_TARGET flag is set for the current governor to struct cpufreq_policy, so that the drivers needing to check CPUFREQ_GOV_STRICT_TARGET do not have to access the governor object during every frequency transition. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- include/linux/cpufreq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 6eb9a3b8ec7b..acbad3b36322 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -109,6 +109,12 @@ struct cpufreq_policy { bool fast_switch_possible; bool fast_switch_enabled; + /* + * Set if the CPUFREQ_GOV_STRICT_TARGET flag is set for the current + * governor. + */ + bool strict_target; + /* * Preferred average time interval between consecutive invocations of * the driver to set the frequency for this policy. To be set by the -- cgit v1.2.3 From 8a3c84b649b033024d2349f96234b26cbd6083a6 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 10 Nov 2020 16:50:21 -0800 Subject: vfs: separate __sb_start_write into blocking and non-blocking helpers Break this function into two helpers so that it's obvious that the trylock versions return a value that must be checked, and the blocking versions don't require that. While we're at it, clean up the return type mismatch. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig --- include/linux/fs.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0bd126418bb6..305989afd49c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1581,7 +1581,8 @@ extern struct timespec64 current_time(struct inode *inode); */ void __sb_end_write(struct super_block *sb, int level); -int __sb_start_write(struct super_block *sb, int level, bool wait); +void __sb_start_write(struct super_block *sb, int level); +bool __sb_start_write_trylock(struct super_block *sb, int level); #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) @@ -1645,12 +1646,12 @@ static inline void sb_end_intwrite(struct super_block *sb) */ static inline void sb_start_write(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_WRITE, true); + __sb_start_write(sb, SB_FREEZE_WRITE); } -static inline int sb_start_write_trylock(struct super_block *sb) +static inline bool sb_start_write_trylock(struct super_block *sb) { - return __sb_start_write(sb, SB_FREEZE_WRITE, false); + return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** @@ -1674,7 +1675,7 @@ static inline int sb_start_write_trylock(struct super_block *sb) */ static inline void sb_start_pagefault(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true); + __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /* @@ -1692,12 +1693,12 @@ static inline void sb_start_pagefault(struct super_block *sb) */ static inline void 
sb_start_intwrite(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_FS, true); + __sb_start_write(sb, SB_FREEZE_FS); } -static inline int sb_start_intwrite_trylock(struct super_block *sb) +static inline bool sb_start_intwrite_trylock(struct super_block *sb) { - return __sb_start_write(sb, SB_FREEZE_FS, false); + return __sb_start_write_trylock(sb, SB_FREEZE_FS); } @@ -2756,14 +2757,14 @@ static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; - __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); + sb_start_write(file_inode(file)->i_sb); } static inline bool file_start_write_trylock(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; - return __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, false); + return sb_start_write_trylock(file_inode(file)->i_sb); } static inline void file_end_write(struct file *file) -- cgit v1.2.3 From 9b8523423b23ee3dfd88e32f5b7207be56a4e782 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 10 Nov 2020 16:50:21 -0800 Subject: vfs: move __sb_{start,end}_write* to fs.h Now that we've straightened out the callers, move these three functions to fs.h since they're fairly trivial. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara --- include/linux/fs.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 305989afd49c..6dabd019cab0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1580,9 +1580,24 @@ extern struct timespec64 current_time(struct inode *inode); * Snapshotting support. */ -void __sb_end_write(struct super_block *sb, int level); -void __sb_start_write(struct super_block *sb, int level); -bool __sb_start_write_trylock(struct super_block *sb, int level); +/* + * These are internal functions, please use sb_start_{write,pagefault,intwrite} + * instead. 
+ */ +static inline void __sb_end_write(struct super_block *sb, int level) +{ + percpu_up_read(sb->s_writers.rw_sem + level-1); +} + +static inline void __sb_start_write(struct super_block *sb, int level) +{ + percpu_down_read(sb->s_writers.rw_sem + level - 1); +} + +static inline bool __sb_start_write_trylock(struct super_block *sb, int level) +{ + return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); +} #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) -- cgit v1.2.3 From 5e844cc37a5cbaa460e68f9a989d321d63088a89 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 11 Nov 2020 20:07:10 +0100 Subject: spi: Introduce device-managed SPI controller allocation SPI driver probing currently comprises two steps, whereas removal comprises only one step: spi_alloc_master() spi_register_controller() spi_unregister_controller() That's because spi_unregister_controller() calls device_unregister() instead of device_del(), thereby releasing the reference on the spi_controller which was obtained by spi_alloc_master(). An SPI driver's private data is contained in the same memory allocation as the spi_controller struct. Thus, once spi_unregister_controller() has been called, the private data is inaccessible. But some drivers need to access it after spi_unregister_controller() to perform further teardown steps. Introduce devm_spi_alloc_master() and devm_spi_alloc_slave(), which release a reference on the spi_controller struct only after the driver has unbound, thereby keeping the memory allocation accessible. Change spi_unregister_controller() to not release a reference if the spi_controller was allocated by one of these new devm functions. The present commit is small enough to be backportable to stable. It allows fixing drivers which use the private data in their ->remove() hook after it's been freed. 
It also allows fixing drivers which neglect to release a reference on the spi_controller in the probe error path. Long-term, most SPI drivers shall be moved over to the devm functions introduced herein. The few that can't shall be changed in a treewide commit to explicitly release the last reference on the controller. That commit shall amend spi_unregister_controller() to no longer release a reference, thereby completing the migration. As a result, the behaviour will be less surprising and more consistent with subsystems such as IIO, which also includes the private data in the allocation of the generic iio_dev struct, but calls device_del() in iio_device_unregister(). Signed-off-by: Lukas Wunner Link: https://lore.kernel.org/r/272bae2ef08abd21388c98e23729886663d19192.1605121038.git.lukas@wunner.de Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 99380c0825db..b390fdac1587 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -734,6 +734,25 @@ static inline struct spi_controller *spi_alloc_slave(struct device *host, return __spi_alloc_controller(host, size, true); } +struct spi_controller *__devm_spi_alloc_controller(struct device *dev, + unsigned int size, + bool slave); + +static inline struct spi_controller *devm_spi_alloc_master(struct device *dev, + unsigned int size) +{ + return __devm_spi_alloc_controller(dev, size, false); +} + +static inline struct spi_controller *devm_spi_alloc_slave(struct device *dev, + unsigned int size) +{ + if (!IS_ENABLED(CONFIG_SPI_SLAVE)) + return NULL; + + return __devm_spi_alloc_controller(dev, size, true); +} + extern int spi_register_controller(struct spi_controller *ctlr); extern int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr); -- cgit v1.2.3 From 7e890c37c25c7cbca37ff0ab292873d8146e713b Mon Sep 17 00:00:00 2001 From: 
Christoph Hellwig Date: Thu, 12 Nov 2020 17:50:04 +0100 Subject: block: add a return value to set_capacity_revalidate_and_notify Return if the function ended up sending an uevent or not. Cc: stable@vger.kernel.org # v5.9 Signed-off-by: Christoph Hellwig Reviewed-by: Petr Vorel Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 38f23d757013..03da3f603d30 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -315,7 +315,7 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); -void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, +bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, bool update_bdev); /* drivers/char/random.c */ -- cgit v1.2.3 From 3347acc6fcd4ee71ad18a9ff9d9dac176b517329 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Fri, 13 Nov 2020 22:51:59 -0800 Subject: compiler.h: fix barrier_data() on clang Commit 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") neglected to copy barrier_data() from compiler-gcc.h into compiler-clang.h. The definition in compiler-gcc.h was really to work around clang's more aggressive optimization, so this broke barrier_data() on clang, and consequently memzero_explicit() as well. For example, this results in at least the memzero_explicit() call in lib/crypto/sha256.c:sha256_transform() being optimized away by clang. Fix this by moving the definition of barrier_data() into compiler.h. Also move the gcc/clang definition of barrier() into compiler.h, __memory_barrier() is icc-specific (and barrier() is already defined using it in compiler-intel.h) and doesn't belong in compiler.h. 
[rdunlap@infradead.org: fix ALPHA builds when SMP is not enabled] Link: https://lkml.kernel.org/r/20201101231835.4589-1-rdunlap@infradead.org Fixes: 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") Signed-off-by: Arvind Sankar Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Tested-by: Nick Desaulniers Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Cc: Link: https://lkml.kernel.org/r/20201014212631.207844-1-nivedita@alum.mit.edu Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 6 ------ include/linux/compiler-gcc.h | 19 ------------------- include/linux/compiler.h | 18 ++++++++++++++++-- 3 files changed, 16 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 230604e7f057..dd7233c48bf3 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -60,12 +60,6 @@ #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif -/* The following are for compatibility with GCC, from compiler-gcc.h, - * and may be redefined here because they should not be shared with other - * compilers, like ICC. - */ -#define barrier() __asm__ __volatile__("" : : : "memory") - #if __has_feature(shadow_call_stack) # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 5deb37024574..74c6c0486eed 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -15,25 +15,6 @@ # error Sorry, your version of GCC is too old - please use 4.9 or newer. #endif -/* Optimization barrier */ - -/* The "volatile" is due to gcc bugs */ -#define barrier() __asm__ __volatile__("": : :"memory") -/* - * This version is i.e. 
to prevent dead stores elimination on @ptr - * where gcc and llvm may behave differently when otherwise using - * normal barrier(): while gcc behavior gets along with a normal - * barrier(), llvm needs an explicit input variable to be assumed - * clobbered. The issue is as follows: while the inline asm might - * access any memory it wants, the compiler could have fit all of - * @ptr into memory registers instead, and since @ptr never escaped - * from that, it proved that the inline asm wasn't touching any of - * it. This version works well with both compilers, i.e. we're telling - * the compiler that the inline asm absolutely may see the contents - * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 - */ -#define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") - /* * This macro obfuscates arithmetic on a variable address so that gcc * shouldn't recognize the original var, and make assumptions about it. diff --git a/include/linux/compiler.h b/include/linux/compiler.h index e512f5505dad..b8fe0c23cfff 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -80,11 +80,25 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Optimization barrier */ #ifndef barrier -# define barrier() __memory_barrier() +/* The "volatile" is due to gcc bugs */ +# define barrier() __asm__ __volatile__("": : :"memory") #endif #ifndef barrier_data -# define barrier_data(ptr) barrier() +/* + * This version is i.e. to prevent dead stores elimination on @ptr + * where gcc and llvm may behave differently when otherwise using + * normal barrier(): while gcc behavior gets along with a normal + * barrier(), llvm needs an explicit input variable to be assumed + * clobbered. The issue is as follows: while the inline asm might + * access any memory it wants, the compiler could have fit all of + * @ptr into memory registers instead, and since @ptr never escaped + * from that, it proved that the inline asm wasn't touching any of + * it. 
This version works well with both compilers, i.e. we're telling + * the compiler that the inline asm absolutely may see the contents + * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 + */ +# define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") #endif /* workaround for GCC PR82365 if needed */ -- cgit v1.2.3 From 8b21ca0218d29cc6bb7028125c7e5a10dfb4730c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 Nov 2020 22:52:13 -0800 Subject: mm: memcontrol: fix missing wakeup polling thread When we poll the swap.events, we can miss being woken up when the swap event occurs. Because we didn't notify. Fixes: f3a53a3a1e5b ("mm, memcontrol: implement memory.swap.events") Signed-off-by: Muchun Song Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Roman Gushchin Cc: Michal Hocko Cc: Yafang Shao Cc: Chris Down Cc: Tejun Heo Link: https://lkml.kernel.org/r/20201105161936.98312-1-songmuchun@bytedance.com Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e391e3c56de5..a80c59af2c60 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -900,12 +900,19 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { + bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || + event == MEMCG_SWAP_FAIL; + atomic_long_inc(&memcg->memory_events_local[event]); - cgroup_file_notify(&memcg->events_local_file); + if (!swap_event) + cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); - cgroup_file_notify(&memcg->events_file); + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); if 
(!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; -- cgit v1.2.3 From dd8088d5a8969dc2b42f71d7bc01c25c61a78066 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 10 Nov 2020 17:29:32 +0800 Subject: PM: runtime: Add pm_runtime_resume_and_get to deal with usage counter In many cases, we need to check the return value of pm_runtime_get_sync, but it brings trouble to the usage counter processing. Many callers forget to decrease the usage counter when it failed, which could result in a reference leak. It has been discussed a lot[0][1]. So we add a function to deal with the usage counter for better coding. [0]https://lkml.org/lkml/2020/6/14/88 [1]https://patchwork.ozlabs.org/project/linux-tegra/list/?series=178139 Signed-off-by: Zhang Qilong Acked-by: Rafael J. Wysocki Signed-off-by: Jakub Kicinski --- include/linux/pm_runtime.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 4b708f4e8eed..b492ae00cc90 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -386,6 +386,27 @@ static inline int pm_runtime_get_sync(struct device *dev) return __pm_runtime_resume(dev, RPM_GET_PUT); } +/** + * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. + * @dev: Target device. + * + * Resume @dev synchronously and if that is successful, increment its runtime + * PM usage counter. Return 0 if the runtime PM usage counter of @dev has been + * incremented or a negative error code otherwise. + */ +static inline int pm_runtime_resume_and_get(struct device *dev) +{ + int ret; + + ret = __pm_runtime_resume(dev, RPM_GET_PUT); + if (ret < 0) { + pm_runtime_put_noidle(dev); + return ret; + } + + return 0; +} + /** * pm_runtime_put - Drop device usage counter and queue up "idle check" if 0. * @dev: Target device. 
-- cgit v1.2.3 From f97bb5272d9e95d400d6c8643ebb146b3e3e7842 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2020 09:08:41 +0100 Subject: sched: Fix data-race in wakeup Mel reported that on some ARM64 platforms loadavg goes bananas and Will tracked it down to the following race: CPU0 CPU1 schedule() prev->sched_contributes_to_load = X; deactivate_task(prev); try_to_wake_up() if (p->on_rq &&) // false if (smp_load_acquire(&p->on_cpu) && // true ttwu_queue_wakelist()) p->sched_remote_wakeup = Y; smp_store_release(prev->on_cpu, 0); where both p->sched_contributes_to_load and p->sched_remote_wakeup are in the same word, and thus the stores X and Y race (and can clobber one another's data). Whereas prior to commit c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") the p->on_cpu handoff serialized access to p->sched_remote_wakeup (just as it still does with p->sched_contributes_to_load) that commit broke that by calling ttwu_queue_wakelist() with p->on_cpu != 0. However, due to p->XXX = X ttwu() schedule() if (p->on_rq && ...) // false smp_mb__after_spinlock() if (smp_load_acquire(&p->on_cpu) && deactivate_task() ttwu_queue_wakelist()) p->on_rq = 0; p->sched_remote_wakeup = Y; We can be sure any 'current' store is complete and 'current' is guaranteed asleep. Therefore we can move p->sched_remote_wakeup into the current flags word. Note: while the observed failure was loadavg accounting gone wrong due to ttwu() clobbering p->sched_contributes_to_load, the reverse problem is also possible where schedule() clobbers p->sched_remote_wakeup, this could result in enqueue_entity() wrecking ->vruntime and causing scheduling artifacts. 
Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Mel Gorman Debugged-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117083016.GK3121392@hirez.programming.kicks-ass.net --- include/linux/sched.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d383cf09e78f..0e91b451d2a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -769,7 +769,6 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; - unsigned sched_remote_wakeup:1; #ifdef CONFIG_PSI unsigned sched_psi_wake_requeue:1; #endif @@ -779,6 +778,21 @@ struct task_struct { /* Unserialized, strictly 'current' */ + /* + * This field must not be in the scheduler word above due to wakelist + * queueing no longer being serialized by p->on_cpu. However: + * + * p->XXX = X; ttwu() + * schedule() if (p->on_rq && ..) // false + * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true + * deactivate_task() ttwu_queue_wakelist()) + * p->on_rq = 0; p->sched_remote_wakeup = Y; + * + * guarantees all stores of 'current' are visible before + * ->sched_remote_wakeup gets used, so it can be in this word. + */ + unsigned sched_remote_wakeup:1; + /* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; -- cgit v1.2.3 From 2279f540ea7d05f22d2f0c4224319330228586bc Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 17 Nov 2020 07:14:32 +0100 Subject: sched/deadline: Fix priority inheritance with multiple scheduling classes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Glenn reported that "an application [he developed produces] a BUG in deadline.c when a SCHED_DEADLINE task contends with CFS tasks on nested PTHREAD_PRIO_INHERIT mutexes. 
I believe the bug is triggered when a CFS task that was boosted by a SCHED_DEADLINE task boosts another CFS task (nested priority inheritance). ------------[ cut here ]------------ kernel BUG at kernel/sched/deadline.c:1462! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 12 PID: 19171 Comm: dl_boost_bug Tainted: ... Hardware name: ... RIP: 0010:enqueue_task_dl+0x335/0x910 Code: ... RSP: 0018:ffffc9000c2bbc68 EFLAGS: 00010002 RAX: 0000000000000009 RBX: ffff888c0af94c00 RCX: ffffffff81e12500 RDX: 000000000000002e RSI: ffff888c0af94c00 RDI: ffff888c10b22600 RBP: ffffc9000c2bbd08 R08: 0000000000000009 R09: 0000000000000078 R10: ffffffff81e12440 R11: ffffffff81e1236c R12: ffff888bc8932600 R13: ffff888c0af94eb8 R14: ffff888c10b22600 R15: ffff888bc8932600 FS: 00007fa58ac55700(0000) GS:ffff888c10b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa58b523230 CR3: 0000000bf44ab003 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? intel_pstate_update_util_hwp+0x13/0x170 rt_mutex_setprio+0x1cc/0x4b0 task_blocks_on_rt_mutex+0x225/0x260 rt_spin_lock_slowlock_locked+0xab/0x2d0 rt_spin_lock_slowlock+0x50/0x80 hrtimer_grab_expiry_lock+0x20/0x30 hrtimer_cancel+0x13/0x30 do_nanosleep+0xa0/0x150 hrtimer_nanosleep+0xe1/0x230 ? __hrtimer_init_sleeper+0x60/0x60 __x64_sys_nanosleep+0x8d/0xa0 do_syscall_64+0x4a/0x100 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fa58b52330d ... ---[ end trace 0000000000000002 ]--- He also provided a simple reproducer creating the situation below: So the execution order of locking steps are the following (N1 and N2 are non-deadline tasks. D1 is a deadline task. M1 and M2 are mutexes that are enabled with priority inheritance.) 
Time moves forward as this timeline goes down: N1 N2 D1 | | | | | | Lock(M1) | | | | | | Lock(M2) | | | | | | Lock(M2) | | | | Lock(M1) | | (!!bug triggered!) | Daniel reported a similar situation as well, by just letting ksoftirqd run with DEADLINE (and eventually block on a mutex). Problem is that boosted entities (Priority Inheritance) use static DEADLINE parameters of the top priority waiter. However, there might be cases where top waiter could be a non-DEADLINE entity that is currently boosted by a DEADLINE entity from a different lock chain (i.e., nested priority chains involving entities of non-DEADLINE classes). In this case, top waiter static DEADLINE parameters could be null (initialized to 0 at fork()) and replenish_dl_entity() would hit a BUG(). Fix this by keeping track of the original donor and using its parameters when a task is boosted. Reported-by: Glenn Elliott Reported-by: Daniel Bristot de Oliveira Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201117061432.517340-1-juri.lelli@redhat.com --- include/linux/sched.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0e91b451d2a2..095fdec07b38 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -551,7 +551,6 @@ struct sched_dl_entity { * overruns. */ unsigned int dl_throttled : 1; - unsigned int dl_boosted : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; @@ -570,6 +569,15 @@ struct sched_dl_entity { * time. */ struct hrtimer inactive_timer; + +#ifdef CONFIG_RT_MUTEXES + /* + * Priority Inheritance. When a DEADLINE scheduling entity is boosted + * pi_se points to the donor, otherwise points to the dl_se it belongs + * to (the original one/itself). 
+ */ + struct sched_dl_entity *pi_se; +#endif }; #ifdef CONFIG_UCLAMP_TASK -- cgit v1.2.3 From 4d213e76a359e540ca786ee937da7f35faa8e5f8 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 10 Nov 2020 15:19:08 +0800 Subject: iommu/vt-d: Avoid panic if iommu init fails in tboot system "intel_iommu=off" command line is used to disable iommu but iommu is force enabled in a tboot system for security reason. However for better performance on high speed network device, a new option "intel_iommu=tboot_noforce" is introduced to disable the force on. By default kernel should panic if iommu init fails in tboot for security reasons, but it's unnecessary if we use "intel_iommu=tboot_noforce,off". Fix the code setting force_on and move intel_iommu_tboot_noforce from tboot code to intel iommu code. Fixes: 7304e8f28bb2 ("iommu/vt-d: Correctly disable Intel IOMMU force on") Signed-off-by: Zhenzhong Duan Tested-by: Lukasz Hawrylko Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20201110071908.3133-1-zhenzhong.duan@gmail.com Signed-off-by: Will Deacon --- include/linux/intel-iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index fbf5b3e7707e..d956987ed032 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -798,7 +798,6 @@ extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; extern int intel_iommu_enabled; -extern int intel_iommu_tboot_noforce; extern int intel_iommu_gfx_mapped; #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) -- cgit v1.2.3 From 2bf31d94423c8ae3ff58e38a115b177df6940399 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Nov 2020 11:18:08 +0100 Subject: jbd2: fix kernel-doc markups Kernel-doc markup should use this format: identifier - description They should not have any type before that, as otherwise the 
parser won't do the right thing. Also, some identifiers have different names between their prototypes and the kernel-doc markup. Reviewed-by: Jan Kara Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/72f5c6628f5f278d67625f60893ffbc2ca28d46e.1605521731.git.mchehab+huawei@kernel.org Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 1c49fd62ff2e..578ff196b3ce 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -401,7 +401,7 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** - * struct jbd_inode - The jbd_inode type is the structure linking inodes in + * struct jbd2_inode - The jbd_inode type is the structure linking inodes in * ordered mode present in a transaction so that we can sync them during commit. */ struct jbd2_inode { -- cgit v1.2.3 From bc2dc4406c463174613047d8b7946e12c8808cda Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 21 Nov 2020 22:17:01 -0800 Subject: compiler-clang: remove version check for BPF Tracing bpftrace parses the kernel headers and uses Clang under the hood. Remove the version check when __BPF_TRACING__ is defined (as bpftrace does) so that this tool can continue to parse kernel headers, even with older clang sources. 
Fixes: commit 1f7a44f63e6c ("compiler-clang: add build check for clang 10.0.1") Reported-by: Chen Yu Reported-by: Jarkko Sakkinen Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Tested-by: Jarkko Sakkinen Acked-by: Jarkko Sakkinen Acked-by: Song Liu Acked-by: Nathan Chancellor Acked-by: Miguel Ojeda Link: https://lkml.kernel.org/r/20201104191052.390657-1-ndesaulniers@google.com Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index dd7233c48bf3..98cff1b4b088 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -8,8 +8,10 @@ + __clang_patchlevel__) #if CLANG_VERSION < 100001 +#ifndef __BPF_TRACING__ # error Sorry, your version of Clang is too old - please use 10.0.1 or newer. #endif +#endif /* Compiler specific definitions for Clang compiler */ -- cgit v1.2.3 From a927bd6ba952d13c52b8b385030943032f659a3e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 21 Nov 2020 22:17:05 -0800 Subject: mm: fix phys_to_target_node() and memory_add_physaddr_to_nid() exports The core-mm has a default __weak implementation of phys_to_target_node() to mirror the weak definition of memory_add_physaddr_to_nid(). That symbol is exported for modules. However, while the export in mm/memory_hotplug.c exported the symbol in the configuration cases of: CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_MEMORY_HOTPLUG=y ...and: CONFIG_NUMA_KEEP_MEMINFO=n CONFIG_MEMORY_HOTPLUG=y ...it failed to export the symbol in the case of: CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_MEMORY_HOTPLUG=n Not only is that broken, but Christoph points out that the kernel should not be exporting any __weak symbol, which means that memory_add_physaddr_to_nid() example that phys_to_target_node() copied is broken too. Rework the definition of phys_to_target_node() and memory_add_physaddr_to_nid() to not require weak symbols. 
Move to the common arch override design-pattern of an asm header defining a symbol to replace the default implementation. The only common header that all memory_add_physaddr_to_nid() producing architectures implement is asm/sparsemem.h. In fact, powerpc already defines its memory_add_physaddr_to_nid() helper in sparsemem.h. Double-down on that observation and define phys_to_target_node() where necessary in asm/sparsemem.h. An alternate consideration that was discarded was to put this override in asm/numa.h, but that entangles with the definition of MAX_NUMNODES relative to the inclusion of linux/nodemask.h, and requires powerpc to grow a new header. The dependency on NUMA_KEEP_MEMINFO for DEV_DAX_HMEM_DEVICES is invalid now that the symbol is properly exported / stubbed in all combinations of CONFIG_NUMA_KEEP_MEMINFO and CONFIG_MEMORY_HOTPLUG. [dan.j.williams@intel.com: v4] Link: https://lkml.kernel.org/r/160461461867.1505359.5301571728749534585.stgit@dwillia2-desk3.amr.corp.intel.com [dan.j.williams@intel.com: powerpc: fix create_section_mapping compile warning] Link: https://lkml.kernel.org/r/160558386174.2948926.2740149041249041764.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: a035b6bf863e ("mm/memory_hotplug: introduce default phys_to_target_node() implementation") Reported-by: Randy Dunlap Reported-by: Thomas Gleixner Reported-by: kernel test robot Reported-by: Christoph Hellwig Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Tested-by: Randy Dunlap Tested-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Joao Martins Cc: Tony Luck Cc: Fenghua Yu Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Vishal Verma Cc: Stephen Rothwell Link: https://lkml.kernel.org/r/160447639846.1133764.7044090803980177548.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 14 -------------- include/linux/numa.h | 30 +++++++++++++++++++++++++++++- 2 files changed, 
29 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d65c6fdc5cfc..551093b74596 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -281,20 +281,6 @@ static inline bool movable_node_is_enabled(void) } #endif /* ! CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_NUMA -extern int memory_add_physaddr_to_nid(u64 start); -extern int phys_to_target_node(u64 start); -#else -static inline int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -static inline int phys_to_target_node(u64 start) -{ - return 0; -} -#endif - #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * pgdat resizing functions diff --git a/include/linux/numa.h b/include/linux/numa.h index 8cb33ccfb671..cb44cfe2b725 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -21,13 +21,41 @@ #endif #ifdef CONFIG_NUMA +#include <linux/printk.h> +#include <asm/sparsemem.h> + /* Generic implementation available */ int numa_map_to_online_node(int node); -#else + +#ifndef memory_add_physaddr_to_nid +static inline int memory_add_physaddr_to_nid(u64 start) +{ + pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +#endif +#ifndef phys_to_target_node +static inline int phys_to_target_node(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +#endif +#else /* !CONFIG_NUMA */ static inline int numa_map_to_online_node(int node) { return NUMA_NO_NODE; } +static inline int memory_add_physaddr_to_nid(u64 start) +{ + return 0; +} +static inline int phys_to_target_node(u64 start) +{ + return 0; +} #endif #endif /* _LINUX_NUMA_H */ -- cgit v1.2.3 From 4349a83a3190c1d4414371161b0f4a4c3ccd3f9d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 21 Nov 2020 22:17:08 -0800 Subject: mm: fix readahead_page_batch for retry entries Both btrfs and fuse have reported faults caused by 
seeing a retry entry instead of the page they were looking for. This was caused by a missing check in the iterator. As can be seen in the panic log below, accessing 0x402 causes a panic. In xarray.h, 0x402 means RETRY_ENTRY. BUG: kernel NULL pointer dereference, address: 0000000000000402 CPU: 14 PID: 306003 Comm: as Not tainted 5.9.0-1-amd64 #1 Debian 5.9.1-1 Hardware name: Lenovo ThinkSystem SR665/7D2VCTO1WW, BIOS D8E106Q-1.01 05/30/2020 RIP: 0010:fuse_readahead+0x152/0x470 [fuse] Code: 41 8b 57 18 4c 8d 54 10 ff 4c 89 d6 48 8d 7c 24 10 e8 d2 e3 28 f9 48 85 c0 0f 84 fe 00 00 00 44 89 f2 49 89 04 d4 44 8d 72 01 <48> 8b 10 41 8b 4f 1c 48 c1 ea 10 83 e2 01 80 fa 01 19 d2 81 e2 01 RSP: 0018:ffffad99ceaebc50 EFLAGS: 00010246 RAX: 0000000000000402 RBX: 0000000000000001 RCX: 0000000000000002 RDX: 0000000000000000 RSI: ffff94c5af90bd98 RDI: ffffad99ceaebc60 RBP: ffff94ddc1749a00 R08: 0000000000000402 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000100 R12: ffff94de6c429ce0 R13: ffff94de6c4d3700 R14: 0000000000000001 R15: ffffad99ceaebd68 FS: 00007f228c5c7040(0000) GS:ffff94de8ed80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000402 CR3: 0000001dbd9b4000 CR4: 0000000000350ee0 Call Trace: read_pages+0x83/0x270 page_cache_readahead_unbounded+0x197/0x230 generic_file_buffered_read+0x57a/0xa20 new_sync_read+0x112/0x1a0 vfs_read+0xf8/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 042124cc64c3 ("mm: add new readahead_control API") Reported-by: David Sterba Reported-by: Wonhyuk Yang Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Link: https://lkml.kernel.org/r/20201103142852.8543-1-willy@infradead.org Link: https://lkml.kernel.org/r/20201103124349.16722-1-vvghjk1234@gmail.com Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git 
a/include/linux/pagemap.h b/include/linux/pagemap.h index e1e19c1f9ec9..d5570deff400 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -906,6 +906,8 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, xas_set(&xas, rac->_index); rcu_read_lock(); xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { + if (xas_retry(&xas, page)) + continue; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); array[i++] = page; -- cgit v1.2.3 From 4df910620bebb5cfe234af16ac8f6474b60215fd Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 25 Nov 2020 13:22:21 +0800 Subject: mm: memcg: relayout structure mem_cgroup to avoid cache interference 0day reported a -22.7% regression for the will-it-scale page_fault2 case [1] on a 4-socket, 144-CPU platform, and bisected it to Waiman's optimization (commit bd0b230fe1) of saving one 'struct page_counter' space for 'struct mem_cgroup'. Initially we thought it was due to the cache alignment change introduced by the patch, but further debugging shows that it is due to some hot data members ('vmstats_local', 'vmstats_percpu', 'vmstats') sitting in 2 adjacent cachelines (2N and 2N+1), and when adjacent cache line prefetch is enabled, this triggers an "extended level" of cache false sharing for the 2 adjacent cache lines. So exchange the 2 member blocks, while mostly keeping the original cache alignment, which restores and even improves the performance, and saves 64 bytes of space for 'struct mem_cgroup' (from 2880 to 2816, with 0day's default RHEL-8.3 kernel config) [1]. 
https://lore.kernel.org/lkml/20201102091543.GM31092@shao2-debian/ Fixes: bd0b230fe145 ("mm/memcg: unify swap and memsw page counters") Reported-by: kernel test robot Signed-off-by: Feng Tang Acked-by: Waiman Long Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a80c59af2c60..922a7f600465 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -282,20 +282,6 @@ struct mem_cgroup { MEMCG_PADDING(_pad1_); - /* - * set > 0 if pages under this cgroup are moving to other cgroup. - */ - atomic_t moving_account; - struct task_struct *move_lock_task; - - /* Legacy local VM stats and events */ - struct memcg_vmstats_percpu __percpu *vmstats_local; - - /* Subtree VM stats and events (batched updates) */ - struct memcg_vmstats_percpu __percpu *vmstats_percpu; - - MEMCG_PADDING(_pad2_); - atomic_long_t vmstats[MEMCG_NR_STAT]; atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; @@ -317,6 +303,20 @@ struct mem_cgroup { struct list_head objcg_list; /* list of inherited objcgs */ #endif + MEMCG_PADDING(_pad2_); + + /* + * set > 0 if pages under this cgroup are moving to other cgroup. + */ + atomic_t moving_account; + struct task_struct *move_lock_task; + + /* Legacy local VM stats and events */ + struct memcg_vmstats_percpu __percpu *vmstats_local; + + /* Subtree VM stats and events (batched updates) */ + struct memcg_vmstats_percpu __percpu *vmstats_percpu; + #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; -- cgit v1.2.3