From 4c05ec47384ab3627b62814e8f886e90cc38ce15 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 26 Nov 2018 20:03:30 +0900 Subject: netfilter: nf_tables: fix suspicious RCU usage in nft_chain_stats_replace() basechain->stats is rcu protected data which is updated from nft_chain_stats_replace(). This function is executed from the commit phase which holds the pernet nf_tables commit mutex - not the global nfnetlink subsystem mutex. Test commands to reproduce the problem are: %iptables-nft -I INPUT %iptables-nft -Z %iptables-nft -Z This patch uses RCU calls to handle basechain->stats updates to fix a splat that looks like: [89279.358755] ============================= [89279.363656] WARNING: suspicious RCU usage [89279.368458] 4.20.0-rc2+ #44 Tainted: G W L [89279.374661] ----------------------------- [89279.379542] net/netfilter/nf_tables_api.c:1404 suspicious rcu_dereference_protected() usage! [...] [89279.406556] 1 lock held by iptables-nft/5225: [89279.411728] #0: 00000000bf45a000 (&net->nft.commit_mutex){+.+.}, at: nf_tables_valid_genid+0x1f/0x70 [nf_tables] [89279.424022] stack backtrace: [89279.429236] CPU: 0 PID: 5225 Comm: iptables-nft Tainted: G W L 4.20.0-rc2+ #44 [89279.430135] Call Trace: [89279.430135] dump_stack+0xc9/0x16b [89279.430135] ? show_regs_print_info+0x5/0x5 [89279.430135] ? lockdep_rcu_suspicious+0x117/0x160 [89279.430135] nft_chain_commit_update+0x4ea/0x640 [nf_tables] [89279.430135] ? sched_clock_local+0xd4/0x140 [89279.430135] ? check_flags.part.35+0x440/0x440 [89279.430135] ? __rhashtable_remove_fast.constprop.67+0xec0/0xec0 [nf_tables] [89279.430135] ? sched_clock_cpu+0x126/0x170 [89279.430135] ? find_held_lock+0x39/0x1c0 [89279.430135] ? hlock_class+0x140/0x140 [89279.430135] ? is_bpf_text_address+0x5/0xf0 [89279.430135] ? check_flags.part.35+0x440/0x440 [89279.430135] ? __lock_is_held+0xb4/0x140 [89279.430135] nf_tables_commit+0x2555/0x39c0 [nf_tables] Fixes: f102d66b335a4 ("netfilter: nf_tables: use dedicated mutex to guard transactions") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 4a520d3304a2..cf09ab37b45b 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -62,18 +62,6 @@ static inline bool lockdep_nfnl_is_held(__u8 subsys_id) } #endif /* CONFIG_PROVE_LOCKING */ -/* - * nfnl_dereference - fetch RCU pointer when updates are prevented by subsys mutex - * - * @p: The pointer to read, prior to dereferencing - * @ss: The nfnetlink subsystem ID - * - * Return the value of the specified RCU-protected pointer, but omit - * the READ_ONCE(), because caller holds the NFNL subsystem mutex. - */ -#define nfnl_dereference(p, ss) \ - rcu_dereference_protected(p, lockdep_nfnl_is_held(ss)) - #define MODULE_ALIAS_NFNL_SUBSYS(subsys) \ MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys)) -- cgit v1.2.3 From 55f3f7eab75c10d9b33d122670b5935ab64db50f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 26 Nov 2018 16:08:43 -0500 Subject: XArray: Add xa_cmpxchg_irq and xa_cmpxchg_bh These convenience wrappers match the other _irq and _bh wrappers we already have. It turns out I'd already open-coded xa_cmpxchg_irq() in the shmem code, so convert that. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 564892e19f8c..f492e21c4aa2 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -553,6 +553,60 @@ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, return curr; } +/** + * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New value to place in array. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_cmpxchg() except it disables softirqs + * while holding the array lock. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. May sleep if the @gfp flags permit. + * Return: The old value at this index or xa_err() if an error happened. + */ +static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_bh(xa); + curr = __xa_cmpxchg(xa, index, old, entry, gfp); + xa_unlock_bh(xa); + + return curr; +} + +/** + * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New value to place in array. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_cmpxchg() except it disables interrupts + * while holding the array lock. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. May sleep if the @gfp flags permit. + * Return: The old value at this index or xa_err() if an error happened. + */ +static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_irq(xa); + curr = __xa_cmpxchg(xa, index, old, entry, gfp); + xa_unlock_irq(xa); + + return curr; +} + /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. -- cgit v1.2.3 From 60a89a3ce0cce515dc663bc1b45ac89202ad6c79 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 4 Dec 2018 20:58:33 -0500 Subject: scsi: t10-pi: Return correct ref tag when queue has no integrity profile Commit ddd0bc756983 ("block: move ref_tag calculation func to the block layer") moved ref tag calculation from SCSI to a library function. However, this change broke returning the correct ref tag for devices operating in DIF mode since these do not have an associated block integrity profile. This in turn caused read/write failures on PI-formatted disks attached to an mpt3sas controller. Fixes: ddd0bc756983 ("block: move ref_tag calculation func to the block layer") Cc: stable@vger.kernel.org # 4.19+ Reported-by: John Garry Tested-by: Xiang Chen Signed-off-by: Martin K. Petersen --- include/linux/t10-pi.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index b9626aa7e90c..3e2a80cc7b56 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -39,12 +39,13 @@ struct t10_pi_tuple { static inline u32 t10_pi_ref_tag(struct request *rq) { + unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + #ifdef CONFIG_BLK_DEV_INTEGRITY - return blk_rq_pos(rq) >> - (rq->q->integrity.interval_exp - 9) & 0xffffffff; -#else - return -1U; + if (rq->q->integrity.interval_exp) + shift = rq->q->integrity.interval_exp; #endif + return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT) & 0xffffffff; } extern const struct blk_integrity_profile t10_pi_type1_crc; -- cgit v1.2.3 From fdadd04931c2d7cd294dc5b2b342863f94be53a3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 11 Dec 2018 12:14:12 +0100 Subject: bpf: fix bpf_jit_limit knob for PAGE_SIZE >= 64K Michael and Sandipan report: Commit ede95a63b5 introduced a bpf_jit_limit tuneable to limit BPF JIT allocations. At compile time it defaults to PAGE_SIZE * 40000, and is adjusted again at init time if MODULES_VADDR is defined. For ppc64 kernels, MODULES_VADDR isn't defined, so we're stuck with the compile-time default at boot-time, which is 0x9c400000 when using 64K page size. This overflows the signed 32-bit bpf_jit_limit value: root@ubuntu:/tmp# cat /proc/sys/net/core/bpf_jit_limit -1673527296 and can cause various unexpected failures throughout the network stack. In one case `strace dhclient eth0` reported: setsockopt(5, SOL_SOCKET, SO_ATTACH_FILTER, {len=11, filter=0x105dd27f8}, 16) = -1 ENOTSUPP (Unknown error 524) and similar failures can be seen with tools like tcpdump. This doesn't always reproduce however, and I'm not sure why. The more consistent failure I've seen is an Ubuntu 18.04 KVM guest booted on a POWER9 host would time out on systemd/netplan configuring a virtio-net NIC with no noticeable errors in the logs. Given this and also given that in near future some architectures like arm64 will have a custom area for BPF JIT image allocations we should get rid of the BPF_JIT_LIMIT_DEFAULT fallback / default entirely. For 4.21, we have an overridable bpf_jit_alloc_exec(), bpf_jit_free_exec() so therefore add another overridable bpf_jit_alloc_exec_limit() helper function which returns the possible size of the memory area for deriving the default heuristic in bpf_jit_charge_init(). Like bpf_jit_alloc_exec() and bpf_jit_free_exec(), the new bpf_jit_alloc_exec_limit() assumes that module_alloc() is the default JIT memory provider, and therefore in case archs implement their custom module_alloc() we use MODULES_{END,_VADDR} for limits and otherwise for vmalloc_exec() cases like on ppc64 we use VMALLOC_{END,_START}. Additionally, for archs supporting large page sizes, we should change the sysctl to be handled as long to not run into sysctl restrictions in future. Fixes: ede95a63b5e8 ("bpf: add bpf_jit_limit knob to restrict unpriv allocations") Reported-by: Sandipan Das Reported-by: Michael Roth Signed-off-by: Daniel Borkmann Tested-by: Michael Roth Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 795ff0b869bb..a8b9d90a8042 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -861,7 +861,7 @@ bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; -extern int bpf_jit_limit; +extern long bpf_jit_limit; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); -- cgit v1.2.3 From 663f146f2ecfcc47934cb9f9543f664eeb6adb46 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Wed, 31 Oct 2018 16:03:21 +0200 Subject: net/mlx5: E-Switch, Fix fdb cap bits swap The cap bits locations for the fdb caps of multi path to table (used for local mirroring) and multi encap (used for prio/chains) were wrongly used in swapped locations. This went unnoted so far b/c we tested the offending patch with CX5 FW that supports both of them. On different environments where not both caps are supported, we will be messed up, fix that. Fixes: b9aa0ba17af5 ('net/mlx5: Add cap bits for multi fdb encap') Signed-off-by: Vu Pham Reviewed-by: Or Gerlitz Tested-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 34e17e6f8942..4e77bfe0b580 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -582,11 +582,13 @@ struct mlx5_ifc_flow_table_nic_cap_bits { }; struct mlx5_ifc_flow_table_eswitch_cap_bits { - u8 reserved_at_0[0x1c]; - u8 fdb_multi_path_to_table[0x1]; - u8 reserved_at_1d[0x1]; + u8 reserved_at_0[0x1a]; u8 multi_fdb_encap[0x1]; - u8 reserved_at_1e[0x1e1]; + u8 reserved_at_1b[0x1]; + u8 fdb_multi_path_to_table[0x1]; + u8 reserved_at_1d[0x3]; + + u8 reserved_at_20[0x1e0]; struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb; -- cgit v1.2.3 From d1402fc708e4c355813e49df6d15bc3466ba5114 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 14 Dec 2018 14:16:53 -0800 Subject: mm: introduce common STRUCT_PAGE_MAX_SHIFT define This define is used by arm64 to calculate the size of the vmemmap region. It is defined as the log2 of the upper bound on the size of a struct page. We move it into mm_types.h so it can be defined properly instead of set and checked with a build bug. This also allows us to use the same define for riscv. Link: http://lkml.kernel.org/r/20181107205433.3875-2-logang@deltatee.com Signed-off-by: Logan Gunthorpe Acked-by: Will Deacon Acked-by: Andrew Morton Acked-by: Ard Biesheuvel Acked-by: Catalin Marinas Cc: Arnd Bergmann Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5ed8f6292a53..2c471a2c43fa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -206,6 +206,11 @@ struct page { #endif } _struct_page_alignment; +/* + * Used for sizing the vmemmap region on some architectures + */ +#define STRUCT_PAGE_MAX_SHIFT (order_base_2(sizeof(struct page))) + #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) -- cgit v1.2.3 From 9def36e0fa9a0d9c5393c039db59f1f2d3a388b3 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 14 Dec 2018 14:16:57 -0800 Subject: mm/sparse: add common helper to mark all memblocks present Presently the arches arm64, arm and sh have a function which loops through each memblock and calls memory present. riscv will require a similar function. Introduce a common memblocks_present() function that can be used by all the arches. Subsequent patches will cleanup the arches that make use of this. Link: http://lkml.kernel.org/r/20181107205433.3875-3-logang@deltatee.com Signed-off-by: Logan Gunthorpe Acked-by: Andrew Morton Cc: Michal Hocko Cc: Vlastimil Babka Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 847705a6d0ec..db023a92f3a4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -783,6 +783,12 @@ void memory_present(int nid, unsigned long start, unsigned long end); static inline void memory_present(int nid, unsigned long start, unsigned long end) {} #endif +#if defined(CONFIG_SPARSEMEM) +void memblocks_present(void); +#else +static inline void memblocks_present(void) {} +#endif + #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else -- cgit v1.2.3 From 15c6d8e565943a904172de69cc13c53f724fa16c Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Thu, 13 Dec 2018 15:00:11 -0500 Subject: mod_devicetable.h: correct kerneldoc typo, "PHYSID2" -> "MII_PHYSID2" Signed-off-by: Robert P. J. Day Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mod_devicetable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 01797cb4587e..a0dcc9b6a723 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -565,7 +565,7 @@ struct platform_device_id { /** * struct mdio_device_id - identifies PHY devices on an MDIO/MII bus * @phy_id: The result of - * (mdio_read(&MII_PHYSID1) << 16 | mdio_read(&PHYSID2)) & @phy_id_mask + * (mdio_read(&MII_PHYSID1) << 16 | mdio_read(&MII_PHYSID2)) & @phy_id_mask * for this PHY type * @phy_id_mask: Defines the significant bits of @phy_id. A value of 0 * is used to terminate an array of struct mdio_device_id. -- cgit v1.2.3