From e5e35e754c28724d5c619f2ec805fd221f8d59ce Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 8 Mar 2021 15:59:18 +0100 Subject: bpf: BPF-helper for MTU checking add length input The FIB lookup example[1] shows how the IP-header field tot_len (iph->tot_len) is used as input to perform the MTU check. This patch extends the BPF-helper bpf_check_mtu() with the same ability to provide the length as user parameter input, via the mtu_len parameter. This still needs to be done before the bpf_check_mtu() helper API becomes frozen. [1] samples/bpf/xdp_fwd_kern.c Fixes: 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/161521555850.3515614.6533850861569774444.stgit@firesoul --- include/uapi/linux/bpf.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79c893310492..4ba4ef0ff63a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3850,7 +3850,7 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based + * Check packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. * @@ -3867,6 +3867,14 @@ union bpf_attr { * against the current net device. This is practical if this isn't * used prior to redirect. * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * * The Linux kernel route table can configure MTUs on a more * specific per route level, which is not provided by this helper. * For route level MTU checks use the **bpf_fib_lookup**\ () @@ -3891,11 +3899,9 @@ union bpf_attr { * * On return *mtu_len* pointer contains the MTU value of the net * device. Remember the net device configured MTU is the L3 size, - * which is returned here and XDP and TX length operate at L2. + * which is returned here and XDP and TC length operate at L2. * Helper take this into account for you, but remember when using - * MTU value in your BPF-code. On input *mtu_len* must be a valid - * pointer and be initialized (to zero), else verifier will reject - * BPF program. + * MTU value in your BPF-code. * * Return * * 0 on success, and populate MTU value in *mtu_len* pointer. -- cgit v1.2.3 From 05a68ce5fa51a83c360381630f823545c5757aa2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Mar 2021 10:50:28 -0800 Subject: bpf: Don't do bpf_cgroup_storage_set() for kprobe/tp programs For kprobe and tracepoint bpf programs, the kernel calls trace_call_bpf() which calls BPF_PROG_RUN_ARRAY_CHECK() to run the program array. Currently, BPF_PROG_RUN_ARRAY_CHECK() also calls bpf_cgroup_storage_set() to set percpu cgroup local storage with a NULL value.
This is due to Commit 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") which modified __BPF_PROG_RUN_ARRAY() to call bpf_cgroup_storage_set() and this macro is also used by BPF_PROG_RUN_ARRAY_CHECK(). kprobe and tracepoint programs are not allowed to call the bpf_get_local_storage() helper and hence do not access percpu cgroup local storage. Let us change BPF_PROG_RUN_ARRAY_CHECK() not to modify percpu cgroup local storage. The issue was observed when I tried to debug [1], where percpu data is overwritten due to the preempt_disable -> migration_disable change. This patch does not completely fix the above issue, which will be addressed separately, e.g., multiple cgroup prog runs may preempt each other. But it does fix any potential issue caused by a tracing program overwriting percpu cgroup storage: - in a busy system, a tracing program may run between bpf_cgroup_storage_set() and the cgroup prog run. - a kprobe program may be triggered by a helper in a cgroup prog before bpf_get_local_storage() is called. [1] https://lore.kernel.org/bpf/CAKH8qBuXCfUz=w8L+Fj74OaUpbosO29niYwTki7e3Ag044_aww@mail.gmail.com/T Fixes: 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Roman Gushchin Link: https://lore.kernel.org/bpf/20210309185028.3763817-1-yhs@fb.com --- include/linux/bpf.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cccaef1088ea..d7e0f479a5b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1093,7 +1093,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, _ret; \ }) -#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ +#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \ ({ \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ @@ -1106,7 +1106,8 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, goto _out; \ _item = &_array->items[0]; \ while ((_prog = READ_ONCE(_item->prog))) { \ - bpf_cgroup_storage_set(_item->cgroup_storage); \ + if (set_cg_storage) \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ _ret &= func(_prog, ctx); \ _item++; \ } \ @@ -1153,10 +1154,10 @@ _out: \ }) #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, false) + __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true) #define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, true) + __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false) #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); -- cgit v1.2.3 From 4806f1e2fee84c053cb68cd5be5817170bf0aab6 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 3 Mar 2021 14:36:16 +0200 Subject: net/mlx5: Set QP timestamp mode to default QPs which don't care about the timestamp mode should set ts_format to default; otherwise QP creation could fail if the timestamp mode is not supported.
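As a rough usage sketch of the helper added below (MLX5_SET() is the standard mlx5 accessor; the wrapper function name here is made up for illustration):

static void foo_set_qpc_ts_format(struct mlx5_core_dev *mdev, void *qpc)
{
	/* Let the device use its default timestamp format unless the
	 * RoCE qp_ts_format capability is absent, in which case the
	 * helper below falls back to the free-running clock.
	 */
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
}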
Fixes: 2fe8d4b87802 ("RDMA/mlx5: Fail QP creation if the device can not support the CQE TS") Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/linux/mlx5/qp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index d75ef8aa8fac..b7deb790f257 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -547,4 +547,11 @@ static inline const char *mlx5_qp_state_str(int state) } } +static inline int mlx5_get_qp_default_ts(struct mlx5_core_dev *dev) +{ + return !MLX5_CAP_ROCE(dev, qp_ts_format) ? + MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING : + MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT; +} + #endif /* MLX5_QP_H */ -- cgit v1.2.3 From c4c877b2732466b4c63217baad05c96f775912c7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Mar 2021 01:38:09 +0100 Subject: net: Consolidate common blackhole dst ops Move the generic blackhole dst ops to the core and use them from both ipv4_dst_blackhole_ops and ip6_dst_blackhole_ops where possible. No functional change otherwise. We need these in other locations as well, and having to define them over and over again is not great. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/dst.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 26f134ad3a25..75b1e734e9c2 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -550,4 +550,15 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } +struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); +void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu, bool confirm_neigh); +void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); +u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old); +struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr); +unsigned int dst_blackhole_mtu(const struct dst_entry *dst); + #endif /* _NET_DST_H */ -- cgit v1.2.3 From 28259bac7f1dde06d8ba324e222bbec9d4e92f2b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 9 Mar 2021 18:20:35 -0800 Subject: ipv6: fix suspicious RCU usage warning Syzbot reported the suspicious RCU usage in nexthop_fib6_nh() when called from ipv6_route_seq_show(). The reason is ipv6_route_seq_start() calls rcu_read_lock_bh(), while nexthop_fib6_nh() calls rcu_dereference_rtnl(). The fix proposed is to add a variant of nexthop_fib6_nh() to use rcu_dereference_bh_rtnl() for ipv6_route_seq_show(). The reported trace is as follows: ./include/net/nexthop.h:416 suspicious rcu_dereference_check() usage!
other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by syz-executor.0/17895: at: seq_read+0x71/0x12a0 fs/seq_file.c:169 at: seq_file_net include/linux/seq_file_net.h:19 [inline] at: ipv6_route_seq_start+0xaf/0x300 net/ipv6/ip6_fib.c:2616 stack backtrace: CPU: 1 PID: 17895 Comm: syz-executor.0 Not tainted 4.15.0-syzkaller #0 Call Trace: [] __dump_stack lib/dump_stack.c:17 [inline] [] dump_stack+0xd8/0x147 lib/dump_stack.c:53 [] lockdep_rcu_suspicious+0x153/0x15d kernel/locking/lockdep.c:5745 [] nexthop_fib6_nh include/net/nexthop.h:416 [inline] [] ipv6_route_native_seq_show net/ipv6/ip6_fib.c:2488 [inline] [] ipv6_route_seq_show+0x436/0x7a0 net/ipv6/ip6_fib.c:2673 [] seq_read+0xccf/0x12a0 fs/seq_file.c:276 [] proc_reg_read+0x10c/0x1d0 fs/proc/inode.c:231 [] do_loop_readv_writev fs/read_write.c:714 [inline] [] do_loop_readv_writev fs/read_write.c:701 [inline] [] do_iter_read+0x49e/0x660 fs/read_write.c:935 [] vfs_readv+0xfb/0x170 fs/read_write.c:997 [] kernel_readv fs/splice.c:361 [inline] [] default_file_splice_read+0x487/0x9c0 fs/splice.c:416 [] do_splice_to+0x129/0x190 fs/splice.c:879 [] splice_direct_to_actor+0x256/0x890 fs/splice.c:951 [] do_splice_direct+0x1dd/0x2b0 fs/splice.c:1060 [] do_sendfile+0x597/0xce0 fs/read_write.c:1459 [] SYSC_sendfile64 fs/read_write.c:1520 [inline] [] SyS_sendfile64+0x155/0x170 fs/read_write.c:1506 [] do_syscall_64+0x1ff/0x310 arch/x86/entry/common.c:305 [] entry_SYSCALL_64_after_hwframe+0x42/0xb7 Fixes: f88d8ea67fbdb ("ipv6: Plumb support for nexthop object in a fib6_info") Reported-by: syzbot Signed-off-by: Wei Wang Cc: David Ahern Cc: Ido Schimmel Cc: Petr Machata Cc: Eric Dumazet Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/nexthop.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 7bc057aee40b..a10a319d7eb2 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -410,6 +410,7 @@ static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel) int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack); +/* Caller should either hold rcu_read_lock(), or RTNL. */ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) { struct nh_info *nhi; @@ -430,6 +431,29 @@ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) return NULL; } +/* Variant of nexthop_fib6_nh(). + * Caller should either hold rcu_read_lock_bh(), or RTNL. 
+ */ +static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh) +{ + struct nh_info *nhi; + + if (nh->is_group) { + struct nh_group *nh_grp; + + nh_grp = rcu_dereference_bh_rtnl(nh->nh_grp); + nh = nexthop_mpath_select(nh_grp, 0); + if (!nh) + return NULL; + } + + nhi = rcu_dereference_bh_rtnl(nh->nh_info); + if (nhi->family == AF_INET6) + return &nhi->fib6_nh; + + return NULL; +} + static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i) { struct fib6_nh *fib6_nh; -- cgit v1.2.3 From dd4fa1dae9f4847cc1fd78ca468ad69e16e5db3e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Mar 2021 01:56:36 -0800 Subject: macvlan: macvlan_count_rx() needs to be aware of preemption macvlan_count_rx() can be called from process context, it is thus necessary to disable preemption before calling u64_stats_update_begin() syzbot was able to spot this on 32bit arch: WARNING: CPU: 1 PID: 4632 at include/linux/seqlock.h:271 __seqprop_assert include/linux/seqlock.h:271 [inline] WARNING: CPU: 1 PID: 4632 at include/linux/seqlock.h:271 __seqprop_assert.constprop.0+0xf0/0x11c include/linux/seqlock.h:269 Modules linked in: Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 4632 Comm: kworker/1:3 Not tainted 5.12.0-rc2-syzkaller #0 Hardware name: ARM-Versatile Express Workqueue: events macvlan_process_broadcast Backtrace: [<82740468>] (dump_backtrace) from [<827406dc>] (show_stack+0x18/0x1c arch/arm/kernel/traps.c:252) r7:00000080 r6:60000093 r5:00000000 r4:8422a3c4 [<827406c4>] (show_stack) from [<82751b58>] (__dump_stack lib/dump_stack.c:79 [inline]) [<827406c4>] (show_stack) from [<82751b58>] (dump_stack+0xb8/0xe8 lib/dump_stack.c:120) [<82751aa0>] (dump_stack) from [<82741270>] (panic+0x130/0x378 kernel/panic.c:231) r7:830209b4 r6:84069ea4 r5:00000000 r4:844350d0 [<82741140>] (panic) from [<80244924>] (__warn+0xb0/0x164 kernel/panic.c:605) r3:8404ec8c r2:00000000 r1:00000000 r0:830209b4 r7:0000010f [<80244874>] (__warn) from [<82741520>] (warn_slowpath_fmt+0x68/0xd4 kernel/panic.c:628) r7:81363f70 r6:0000010f r5:83018e50 r4:00000000 [<827414bc>] (warn_slowpath_fmt) from [<81363f70>] (__seqprop_assert include/linux/seqlock.h:271 [inline]) [<827414bc>] (warn_slowpath_fmt) from [<81363f70>] (__seqprop_assert.constprop.0+0xf0/0x11c include/linux/seqlock.h:269) r8:5a109000 r7:0000000f r6:a568dac0 r5:89802300 r4:00000001 [<81363e80>] (__seqprop_assert.constprop.0) from [<81364af0>] (u64_stats_update_begin include/linux/u64_stats_sync.h:128 [inline]) [<81363e80>] (__seqprop_assert.constprop.0) from [<81364af0>] (macvlan_count_rx include/linux/if_macvlan.h:47 [inline]) [<81363e80>] (__seqprop_assert.constprop.0) from [<81364af0>] (macvlan_broadcast+0x154/0x26c drivers/net/macvlan.c:291) r5:89802300 r4:8a927740 [<8136499c>] (macvlan_broadcast) from [<81365020>] (macvlan_process_broadcast+0x258/0x2d0 drivers/net/macvlan.c:317) r10:81364f78 r9:8a86d000 r8:8a9c7e7c r7:8413aa5c r6:00000000 r5:00000000 r4:89802840 [<81364dc8>] (macvlan_process_broadcast) from [<802696a4>] (process_one_work+0x2d4/0x998 kernel/workqueue.c:2275) r10:00000008 r9:8404ec98 r8:84367a02 r7:ddfe6400 r6:ddfe2d40 r5:898dac80 r4:8a86d43c [<802693d0>] (process_one_work) from [<80269dcc>] (worker_thread+0x64/0x54c kernel/workqueue.c:2421) r10:00000008 r9:8a9c6000 r8:84006d00 r7:ddfe2d78 r6:898dac94 r5:ddfe2d40 r4:898dac80 [<80269d68>] (worker_thread) from [<80271f40>] (kthread+0x184/0x1a4 kernel/kthread.c:292) r10:85247e64 r9:898dac80 r8:80269d68 r7:00000000 r6:8a9c6000 r5:89a2ee40 r4:8a97bd00 [<80271dbc>] 
(kthread) from [<80200114>] (ret_from_fork+0x14/0x20 arch/arm/kernel/entry-common.S:158) Exception stack(0x8a9c7fb0 to 0x8a9c7ff8) Fixes: 412ca1550cbe ("macvlan: Move broadcasts into a work queue") Signed-off-by: Eric Dumazet Cc: Herbert Xu Reported-by: syzbot Acked-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/if_macvlan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 96556c64c95d..10c94a3936ca 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -43,13 +43,14 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan, if (likely(success)) { struct vlan_pcpu_stats *pcpu_stats; - pcpu_stats = this_cpu_ptr(vlan->pcpu_stats); + pcpu_stats = get_cpu_ptr(vlan->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); pcpu_stats->rx_packets++; pcpu_stats->rx_bytes += len; if (multicast) pcpu_stats->rx_multicast++; u64_stats_update_end(&pcpu_stats->syncp); + put_cpu_ptr(vlan->pcpu_stats); } else { this_cpu_inc(vlan->pcpu_stats->rx_errors); } -- cgit v1.2.3 From e323d865b36134e8c5c82c834df89109a5c60dab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Mar 2021 08:26:41 -0800 Subject: net: sched: validate stab values iproute2 package is well behaved, but malicious user space can provide illegal shift values and trigger UBSAN reports. Add stab parameter to red_check_params() to validate user input. syzbot reported: UBSAN: shift-out-of-bounds in ./include/net/red.h:312:18 shift exponent 111 is too large for 64-bit type 'long unsigned int' CPU: 1 PID: 14662 Comm: syz-executor.3 Not tainted 5.12.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x141/0x1d7 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:327 red_calc_qavg_from_idle_time include/net/red.h:312 [inline] red_calc_qavg include/net/red.h:353 [inline] choke_enqueue.cold+0x18/0x3dd net/sched/sch_choke.c:221 __dev_xmit_skb net/core/dev.c:3837 [inline] __dev_queue_xmit+0x1943/0x2e00 net/core/dev.c:4150 neigh_hh_output include/net/neighbour.h:499 [inline] neigh_output include/net/neighbour.h:508 [inline] ip6_finish_output2+0x911/0x1700 net/ipv6/ip6_output.c:117 __ip6_finish_output net/ipv6/ip6_output.c:182 [inline] __ip6_finish_output+0x4c1/0xe10 net/ipv6/ip6_output.c:161 ip6_finish_output+0x35/0x200 net/ipv6/ip6_output.c:192 NF_HOOK_COND include/linux/netfilter.h:290 [inline] ip6_output+0x1e4/0x530 net/ipv6/ip6_output.c:215 dst_output include/net/dst.h:448 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] NF_HOOK include/linux/netfilter.h:295 [inline] ip6_xmit+0x127e/0x1eb0 net/ipv6/ip6_output.c:320 inet6_csk_xmit+0x358/0x630 net/ipv6/inet6_connection_sock.c:135 dccp_transmit_skb+0x973/0x12c0 net/dccp/output.c:138 dccp_send_reset+0x21b/0x2b0 net/dccp/output.c:535 dccp_finish_passive_close net/dccp/proto.c:123 [inline] dccp_finish_passive_close+0xed/0x140 net/dccp/proto.c:118 dccp_terminate_connection net/dccp/proto.c:958 [inline] dccp_close+0xb3c/0xe60 net/dccp/proto.c:1028 inet_release+0x12e/0x280 net/ipv4/af_inet.c:431 inet6_release+0x4c/0x70 net/ipv6/af_inet6.c:478 __sock_release+0xcd/0x280 net/socket.c:599 sock_close+0x18/0x20 net/socket.c:1258 __fput+0x288/0x920 fs/file_table.c:280 task_work_run+0xdd/0x1a0 kernel/task_work.c:140 tracehook_notify_resume 
include/linux/tracehook.h:189 [inline] Fixes: 8afa10cbe281 ("net_sched: red: Avoid illegal values") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/red.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/red.h b/include/net/red.h index 932f0d79d60c..9e6647c4ccd1 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -168,7 +168,8 @@ static inline void red_set_vars(struct red_vars *v) v->qcount = -1; } -static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, u8 Scell_log) +static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, + u8 Scell_log, u8 *stab) { if (fls(qth_min) + Wlog > 32) return false; @@ -178,6 +179,13 @@ static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, u8 Scell_ return false; if (qth_max < qth_min) return false; + if (stab) { + int i; + + for (i = 0; i < RED_STAB_SIZE; i++) + if (stab[i] >= 32) + return false; + } return true; } -- cgit v1.2.3 From f211ac154577ec9ccf07c15f18a6abf0d9bdb4ab Mon Sep 17 00:00:00 2001 From: liuyacan Date: Fri, 12 Mar 2021 00:32:25 +0800 Subject: net: correct sk_acceptq_is_full() The "backlog" argument in listen() specifies the maximum length of pending connections, so the accept queue should be considered full if there are exactly "backlog" elements. Signed-off-by: liuyacan Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 636810ddcd9b..0b6266fd6bf6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -936,7 +936,7 @@ static inline void sk_acceptq_added(struct sock *sk) static inline bool sk_acceptq_is_full(const struct sock *sk) { - return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); + return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog); } /* -- cgit v1.2.3 From c9570d4a5efd04479b3cd09c39b571eb031d94f4 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 31 Dec 2020 09:52:52 +0100 Subject: extcon: Add stubs for extcon_register_notifier_all() functions Add stubs for the extcon_register_notifier_all() family of functions for the !CONFIG_EXTCON case. This is useful for compile testing and for drivers which use EXTCON but do not require it (therefore do not depend on CONFIG_EXTCON).
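A consumer might look like the following sketch (the driver function is hypothetical; with the stubs below this compiles even when CONFIG_EXTCON is disabled, where registration becomes a successful no-op):

#include <linux/extcon.h>

static int foo_register_cable_notifier(struct device *dev,
				       struct extcon_dev *edev,
				       struct notifier_block *nb)
{
	/* Optional extcon support: without CONFIG_EXTCON the stub
	 * simply returns 0 and the notifier is never invoked.
	 */
	return devm_extcon_register_notifier_all(dev, edev, nb);
}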
Fixes: 815429b39d94 ("extcon: Add new extcon_register_notifier_all() to monitor all external connectors") Reported-by: kernel test robot Signed-off-by: Krzysztof Kozlowski Signed-off-by: Chanwoo Choi --- include/linux/extcon.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index fd183fb9c20f..0c19010da77f 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -271,6 +271,29 @@ static inline void devm_extcon_unregister_notifier(struct device *dev, struct extcon_dev *edev, unsigned int id, struct notifier_block *nb) { } +static inline int extcon_register_notifier_all(struct extcon_dev *edev, + struct notifier_block *nb) +{ + return 0; +} + +static inline int extcon_unregister_notifier_all(struct extcon_dev *edev, + struct notifier_block *nb) +{ + return 0; +} + +static inline int devm_extcon_register_notifier_all(struct device *dev, + struct extcon_dev *edev, + struct notifier_block *nb) +{ + return 0; +} + +static inline void devm_extcon_unregister_notifier_all(struct device *dev, + struct extcon_dev *edev, + struct notifier_block *nb) { } + static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name) { return ERR_PTR(-ENODEV); -- cgit v1.2.3 From d3d40f237480abf3268956daf18cdc56edd32834 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Mon, 8 Mar 2021 14:24:12 +1300 Subject: Revert "netfilter: x_tables: Switch synchronization to RCU" This reverts commit cc00bcaa589914096edef7fb87ca5cee4a166b5c. This (and the preceding) patch basically re-implemented the RCU mechanisms of patch 784544739a25. That patch was replaced because of the performance problems that it created when replacing tables. Now, we have the same issue: the call to synchronize_rcu() makes replacing tables slower by as much as an order of magnitude. Prior to using RCU a script calling "iptables" approx. 200 times was taking 1.16s. With RCU this increased to 11.59s. Revert these patches and fix the issue in a different way. Signed-off-by: Mark Tomlinson Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 8ebb64193757..5deb099d156d 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -227,7 +227,7 @@ struct xt_table { unsigned int valid_hooks; /* Man behind the curtain... */ - struct xt_table_info __rcu *private; + struct xt_table_info *private; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; @@ -448,9 +448,6 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); -struct xt_table_info -*xt_table_get_private_protected(const struct xt_table *table); - #ifdef CONFIG_COMPAT #include -- cgit v1.2.3 From 175e476b8cdf2a4de7432583b49c871345e4f8a1 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Mon, 8 Mar 2021 14:24:13 +1300 Subject: netfilter: x_tables: Use correct memory barriers. When a new table value was assigned, it was followed by a write memory barrier. This ensured that all writes before this point would complete before any writes after this point. However, to determine whether the rules are unused, the sequence counter is read. 
To ensure that all writes have been done before these reads, a full memory barrier is needed, not just a write memory barrier. The same argument applies when incrementing the counter, before the rules are read. Changing to using smp_mb() instead of smp_wmb() fixes the kernel panic reported in cc00bcaa5899 (which is still present), while still maintaining the same speed of replacing tables. The smp_mb() barriers potentially slow the packet path; however, testing has shown no measurable change in performance on a 4-core MIPS64 platform. Fixes: 7f5c6d4f665b ("netfilter: get rid of atomic ops in fast path") Signed-off-by: Mark Tomlinson Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 5deb099d156d..8ec48466410a 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -376,7 +376,7 @@ static inline unsigned int xt_write_recseq_begin(void) * since addend is most likely 1 */ __this_cpu_add(xt_recseq.sequence, addend); - smp_wmb(); + smp_mb(); return addend; } -- cgit v1.2.3 From 7233da86697efef41288f8b713c10c2499cffe85 Mon Sep 17 00:00:00 2001 From: Alexander Ovechkin Date: Mon, 15 Mar 2021 14:05:45 +0300 Subject: tcp: relookup sock for RST+ACK packets handled by obsolete req sock Currently tcp_check_req can be called with an obsolete req socket for which the big socket has already been created (because of a CPU race or early demux assigning the req socket to multiple packets in a gro batch). Commit e0f9759f530bf789e984 ("tcp: try to keep packet if SYN_RCV race is lost") added a retry for the case when tcp_check_req is called for a PSH|ACK packet. But if the client sends RST+ACK immediately after the connection is established (it is performing a healthcheck, for example) the retry does not occur. In that case tcp_check_req tries to close the req socket, leaving the big socket active. Fixes: e0f9759f530 ("tcp: try to keep packet if SYN_RCV race is lost") Signed-off-by: Alexander Ovechkin Reported-by: Oleg Senin Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 10a625760de9..3c8c59471bc1 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -282,7 +282,7 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog; } -void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); +bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) -- cgit v1.2.3 From 3a5ca857079ea022e0b1b17fc154f7ad7dbc150f Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 2 Mar 2021 13:24:23 +0100 Subject: can: dev: Move device back to init netns on owning netns delete When a non-initial netns is destroyed, the usual policy is to delete all virtual network interfaces contained, but move physical interfaces back to the initial netns. This keeps the physical interface visible on the system. CAN devices are somewhat special, as they define rtnl_link_ops even if they are physical devices.
If a CAN interface is moved into a non-initial netns, destroying that netns lets the interface vanish instead of moving it back to the initial netns. default_device_exit() skips CAN interfaces due to having rtnl_link_ops set. Reproducer: ip netns add foo ip link set can0 netns foo ip netns delete foo WARNING: CPU: 1 PID: 84 at net/core/dev.c:11030 ops_exit_list+0x38/0x60 CPU: 1 PID: 84 Comm: kworker/u4:2 Not tainted 5.10.19 #1 Workqueue: netns cleanup_net [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x94/0xa8) [] (dump_stack) from [] (__warn+0xb8/0x114) [] (__warn) from [] (warn_slowpath_fmt+0x7c/0xac) [] (warn_slowpath_fmt) from [] (ops_exit_list+0x38/0x60) [] (ops_exit_list) from [] (cleanup_net+0x230/0x380) [] (cleanup_net) from [] (process_one_work+0x1d8/0x438) [] (process_one_work) from [] (worker_thread+0x64/0x5a8) [] (worker_thread) from [] (kthread+0x148/0x14c) [] (kthread) from [] (ret_from_fork+0x14/0x2c) To properly restore physical CAN devices to the initial netns on owning netns exit, introduce a flag on rtnl_link_ops that can be set by drivers. For CAN devices setting this flag, default_device_exit() considers them non-virtual, applying the usual namespace move. The issue was introduced in the commit mentioned below, as at that time CAN devices did not have a dellink() operation. Fixes: e008b5fc8dc7 ("net: Simplfy default_device_exit and improve batching.") Link: https://lore.kernel.org/r/20210302122423.872326-1-martin@strongswan.org Signed-off-by: Martin Willi Signed-off-by: Marc Kleine-Budde --- include/net/rtnetlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index e2091bb2b3a8..4da61c950e93 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -33,6 +33,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) * * @list: Used internally * @kind: Identifier + * @netns_refund: Physical device, move to init_net on netns exit * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters @@ -64,6 +65,7 @@ struct rtnl_link_ops { size_t priv_size; void (*setup)(struct net_device *dev); + bool netns_refund; unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], -- cgit v1.2.3 From d29334c15d33a6a92d2043ca88f84cd5ad026c57 Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 16 Mar 2021 16:33:54 +0800 Subject: net/sched: act_api: fix missing post_ct flag for ovs after conntrack in act_ct When openvswitch offloads conntrack with the act_ct action, the first rule performs conntrack via act_ct in the tc subsystem. If the next rule then misses in tc and the packet falls back to the ovs datapath, the post_ct flag is not set, which leaves the ct_state key with the -trk flag. Fixes: 7baf2429a1a9 ("net/sched: cls_flower add CT_FLAGS_INVALID flag support") Signed-off-by: wenxu Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6d0a33d1c0db..f2c9ee71cb2c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -285,6 +285,7 @@ struct nf_bridge_info { struct tc_skb_ext { __u32 chain; __u16 mru; + bool post_ct; }; #endif -- cgit v1.2.3 From cb038357937ee4f589aab2469ec3896dce90f317 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 16 Mar 2021 15:36:47 -0700 Subject: net: fix race between napi kthread mode and busy poll Currently, napi_thread_wait() checks the NAPI_STATE_SCHED bit to determine if the kthread owns this napi and could call napi->poll() on it. However, if socket busy poll is enabled, it is possible that the busy poll thread grabs this SCHED bit (after the previous napi->poll() invokes napi_complete_done() and clears the SCHED bit) and tries to poll on the same napi. napi_disable() could grab the SCHED bit as well. This patch tries to fix this race by adding a new bit, NAPI_STATE_SCHED_THREADED, in napi->state. This bit gets set in ____napi_schedule() if the threaded mode is enabled, and gets cleared in napi_complete_done(), and we only poll the napi in the kthread if this bit is set. This helps distinguish the ownership of the napi between the kthread and other scenarios and fixes the race issue. Fixes: 29863d41bb6e ("net: implement threaded-able napi poll loop support") Reported-by: Martin Zaharinov Suggested-by: Jakub Kicinski Signed-off-by: Wei Wang Cc: Alexander Duyck Cc: Eric Dumazet Cc: Paolo Abeni Cc: Hannes Frederic Sowa Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5b67ea89d5f2..87a5d186faff 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -360,6 +360,7 @@ enum { NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ + NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ }; enum { @@ -372,6 +373,7 @@ enum { NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), + NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), }; enum gro_result { -- cgit v1.2.3 From e21aa341785c679dd409c8cb71f864c00fe6c463 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 16 Mar 2021 14:00:07 -0700 Subject: bpf: Fix fexit trampoline. The fexit/fmod_ret programs can be attached to kernel functions that can sleep. The synchronize_rcu_tasks() will not wait for such tasks to complete. In such a case the trampoline image will be freed and when the task wakes up the return IP will point to freed memory, causing a crash. Solve this by adding percpu_ref_get/put for the duration of the trampoline and separating the trampoline's lifetime from its image's. The "half page" optimization has to be removed, since the first_half->second_half->first_half transition cannot be guaranteed to complete in deterministic time. Every trampoline update becomes a new image. An image with fmod_ret or fexit progs will be freed via percpu_ref_kill and call_rcu_tasks. Together they will wait for the original function and trampoline asm to complete.
The trampoline is patched from nop to jmp to skip fexit progs. They are freed independently from the trampoline. An image with only fentry progs will be freed via call_rcu_tasks_trace+call_rcu_tasks, which will wait for both sleepable and non-sleepable progs to complete. Fixes: fec56f5890d9 ("bpf: Introduce BPF trampoline") Reported-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Paul E. McKenney # for RCU Link: https://lore.kernel.org/bpf/20210316210007.38949-1-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d7e0f479a5b0..3625f019767d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -21,6 +21,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -556,7 +557,8 @@ struct bpf_tramp_progs { * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +struct bpf_tramp_image; +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call); @@ -565,6 +567,8 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog); void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); +void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); +void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); struct bpf_ksym { unsigned long start; @@ -583,6 +587,18 @@ enum bpf_tramp_prog_type { BPF_TRAMP_REPLACE, /* more than MAX */ }; +struct bpf_tramp_image { + void *image; + struct bpf_ksym ksym; + struct percpu_ref pcref; + void *ip_after_call; + void *ip_epilogue; + union { + struct rcu_head rcu; + struct work_struct work; + }; +}; + struct bpf_trampoline { /* hlist for trampoline_table */ struct hlist_node hlist; @@ -605,9 +621,8 @@ struct bpf_trampoline { /* Number of attached programs. A counter per kind. */ int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ - void *image; + struct bpf_tramp_image *cur_image; u64 selector; - struct bpf_ksym ksym; }; struct bpf_attach_target_info { @@ -691,6 +706,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); +int bpf_jit_charge_modmem(u32 pages); +void bpf_jit_uncharge_modmem(u32 pages); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) @@ -787,7 +804,6 @@ struct bpf_prog_aux { bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; - enum bpf_tramp_prog_type trampoline_prog_type; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; -- cgit v1.2.3 From 7b35582cd04ace2fd1807c1b624934e465cc939d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Mar 2021 12:54:57 +0100 Subject: netfilter: nftables: allow to update flowtable flags Honor flowtable flags from the control update path. However, do not allow toggling the hardware offload flag on an existing flowtable.
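The implied update-path rule can be sketched as follows (an assumed shape, not the exact kernel code; NFT_FLOWTABLE_HW_OFFLOAD is the existing uapi flag, and the field layout is inferred from context):

static int foo_flowtable_update_flags(struct nft_flowtable *flowtable,
				      u32 flags)
{
	/* Accept updated flags, but refuse to flip the offload bit
	 * on a flowtable that already exists.
	 */
	if ((flowtable->data.flags ^ flags) & NFT_FLOWTABLE_HW_OFFLOAD)
		return -EOPNOTSUPP;

	return 0;	/* the new flags are carried in the transaction below */
}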
Fixes: 8bb69f3b2918 ("netfilter: nf_tables: add flowtable offload control plane") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index fdec57d862b7..5aaced6bf13e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1536,6 +1536,7 @@ struct nft_trans_flowtable { struct nft_flowtable *flowtable; bool update; struct list_head hook_list; + u32 flags; }; #define nft_trans_flowtable(trans) \ @@ -1544,6 +1545,8 @@ struct nft_trans_flowtable { (((struct nft_trans_flowtable *)trans->data)->update) #define nft_trans_flowtable_hooks(trans) \ (((struct nft_trans_flowtable *)trans->data)->hook_list) +#define nft_trans_flowtable_flags(trans) \ + (((struct nft_trans_flowtable *)trans->data)->flags) int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); -- cgit v1.2.3 From 2e8496f31d0be8f43849b2980b069f3a9805d047 Mon Sep 17 00:00:00 2001 From: Richard Gong Date: Tue, 9 Feb 2021 16:20:27 -0600 Subject: firmware: stratix10-svc: reset COMMAND_RECONFIG_FLAG_PARTIAL to 0 Clean up COMMAND_RECONFIG_FLAG_PARTIAL flag by resetting it to 0, which aligns with the firmware settings. Fixes: 36847f9e3e56 ("firmware: stratix10-svc: correct reconfig flag and timeout values") Signed-off-by: Richard Gong Reviewed-by: Tom Rix Signed-off-by: Moritz Fischer --- include/linux/firmware/intel/stratix10-svc-client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index ebc295647581..19781b0f6429 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -56,7 +56,7 @@ * COMMAND_RECONFIG_FLAG_PARTIAL: * Set to FPGA configuration type (full or partial). */ -#define COMMAND_RECONFIG_FLAG_PARTIAL 1 +#define COMMAND_RECONFIG_FLAG_PARTIAL 0 /* * Timeout settings for service clients: -- cgit v1.2.3 From 8a2dc6af67a0c9f65a22ea40fc79974ee8f368c7 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Fri, 19 Mar 2021 10:16:23 +0530 Subject: sch_red: Fix a typo s/recalcultion/recalculation/ Signed-off-by: Bhaskar Chowdhury Signed-off-by: David S. Miller --- include/net/red.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/red.h b/include/net/red.h index 9e6647c4ccd1..0b39eff1d50a 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -295,7 +295,7 @@ static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms int shift; /* - * The problem: ideally, average length queue recalcultion should + * The problem: ideally, average length queue recalculation should * be done over constant clock intervals. This is too expensive, so * that the calculation is driven by outgoing packets. * When the queue is idle we have to model this clock by hand. 
-- cgit v1.2.3 From f60a85cad677c4f9bb4cadd764f1d106c38c7cf8 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 17 Mar 2021 11:09:15 +0800 Subject: bpf: Fix umd memory leak in copy_process() Syzbot reported a memory leak as follows: BUG: memory leak unreferenced object 0xffff888101b41d00 (size 120): comm "kworker/u4:0", pid 8, jiffies 4294944270 (age 12.780s) backtrace: [] alloc_pid+0x66/0x560 [] copy_process+0x1465/0x25e0 [] kernel_clone+0xf3/0x670 [] kernel_thread+0x61/0x80 [] call_usermodehelper_exec_work [] call_usermodehelper_exec_work+0xc4/0x120 [] process_one_work+0x2c9/0x600 [] worker_thread+0x59/0x5d0 [] kthread+0x178/0x1b0 [] ret_from_fork+0x1f/0x30 unreferenced object 0xffff888110ef5c00 (size 232): comm "kworker/u4:0", pid 8414, jiffies 4294944270 (age 12.780s) backtrace: [] kmem_cache_zalloc [] __alloc_file+0x1f/0xf0 [] alloc_empty_file+0x69/0x120 [] alloc_file+0x33/0x1b0 [] alloc_file_pseudo+0xb2/0x140 [] create_pipe_files+0x138/0x2e0 [] umd_setup+0x33/0x220 [] call_usermodehelper_exec_async+0xb4/0x1b0 [] ret_from_fork+0x1f/0x30 After the UMD process exits, the pipe_to_umh/pipe_from_umh and tgid need to be released. Fixes: d71fa5c9763c ("bpf: Add kernel module with user mode driver that populates bpffs.") Reported-by: syzbot+44908bb56d2bfe56b28e@syzkaller.appspotmail.com Signed-off-by: Zqiang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210317030915.2865-1-qiang.zhang@windriver.com --- include/linux/usermode_driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h index 073a9e0ec07d..ad970416260d 100644 --- a/include/linux/usermode_driver.h +++ b/include/linux/usermode_driver.h @@ -14,5 +14,6 @@ struct umd_info { int umd_load_blob(struct umd_info *info, const void *data, size_t len); int umd_unload_blob(struct umd_info *info); int fork_usermode_driver(struct umd_info *info); +void umd_cleanup_helper(struct umd_info *info); #endif /* __LINUX_USERMODE_DRIVER_H__ */ -- cgit v1.2.3 From 2d669ceb69c276f7637cf760287ca4187add082e Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Tue, 16 Mar 2021 13:36:02 +0900 Subject: dm table: Fix zoned model check and zone sectors check Commit 24f6b6036c9e ("dm table: fix zoned iterate_devices based device capability checks") triggered a dm table load failure when a dm-zoned device is set up on zoned block devices with a regular device for cache. The commit inverted the logic of two callback functions for iterate_devices: device_is_zoned_model() and device_matches_zone_sectors(). With the logic of device_is_zoned_model() inverted, all destination devices of all targets in a dm table are required to have the expected zoned model. This is fine for dm-linear, dm-flakey and dm-crypt on zoned block devices since each target has only one destination device. However, this results in failure for dm-zoned with a regular cache device, since that target has both a regular block device and zoned block devices. As for device_matches_zone_sectors(), the commit inverted the logic to require that all zoned block devices in each target have the specified zone_sectors. This check also fails for a regular block device, which does not have zones. To avoid the check failures, fix the zone model check and the zone sectors check. For zone model check, introduce the new feature flag DM_TARGET_MIXED_ZONED_MODEL, and set it for the dm-zoned target. When the target has this flag, allow it to have destination devices with any zoned model.
For zone sectors check, skip the check if the destination device is not a zoned block device. Also add comments and improve an error message to clarify expectations to the two checks. Fixes: 24f6b6036c9e ("dm table: fix zoned iterate_devices based device capability checks") Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- include/linux/device-mapper.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 7f4ac87c0b32..5c641f930caf 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -253,7 +253,11 @@ struct target_type { #define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY) /* - * Indicates that a target supports host-managed zoned block devices. + * Indicates support for zoned block devices: + * - DM_TARGET_ZONED_HM: the target also supports host-managed zoned + * block devices but does not support combining different zoned models. + * - DM_TARGET_MIXED_ZONED_MODEL: the target supports combining multiple + * devices with different zoned models. */ #ifdef CONFIG_BLK_DEV_ZONED #define DM_TARGET_ZONED_HM 0x00000040 @@ -275,6 +279,15 @@ struct target_type { #define DM_TARGET_PASSES_CRYPTO 0x00000100 #define dm_target_passes_crypto(type) ((type)->features & DM_TARGET_PASSES_CRYPTO) +#ifdef CONFIG_BLK_DEV_ZONED +#define DM_TARGET_MIXED_ZONED_MODEL 0x00000200 +#define dm_target_supports_mixed_zoned_model(type) \ + ((type)->features & DM_TARGET_MIXED_ZONED_MODEL) +#else +#define DM_TARGET_MIXED_ZONED_MODEL 0x00000000 +#define dm_target_supports_mixed_zoned_model(type) (false) +#endif + struct dm_target { struct dm_table *table; struct target_type *type; -- cgit v1.2.3 From eb50aaf960e3bedfef79063411ffd670da94b84b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 22 Mar 2021 18:31:00 +0200 Subject: ACPI: scan: Use unique number for instance_no The decrementation of acpi_device_bus_id->instance_no in acpi_device_del() is incorrect, because it may cause a duplicate instance number to be allocated next time a device with the same acpi_device_bus_id is added. Replace above mentioned approach by using IDA framework. While at it, define the instance range to be [0, 4096). Fixes: e49bd2dd5a50 ("ACPI: use PNPID:instance_no as bus_id of ACPI device") Fixes: ca9dc8d42b30 ("ACPI / scan: Fix acpi_bus_id_list bookkeeping") Signed-off-by: Andy Shevchenko Cc: 4.10+ # 4.10+ Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 02a716a0af5d..f28b097c658f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -233,6 +233,7 @@ struct acpi_pnp_type { struct acpi_device_pnp { acpi_bus_id bus_id; /* Object name */ + int instance_no; /* Instance number of this object */ struct acpi_pnp_type type; /* ID type */ acpi_bus_address bus_address; /* _ADR */ char *unique_id; /* _UID */ -- cgit v1.2.3 From 291da9d4a9eb3a1cb0610b7f4480f5b52b1825e7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Mar 2021 09:46:13 +0100 Subject: locking/mutex: Fix non debug version of mutex_lock_io_nested() If CONFIG_DEBUG_LOCK_ALLOC=n then mutex_lock_io_nested() maps to mutex_lock() which is clearly wrong because mutex_lock() lacks the io_schedule_prepare()/finish() invocations. Map it to mutex_lock_io(). 
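For context, mutex_lock_io() brackets the acquisition with iowait accounting, which plain mutex_lock() never provides; roughly (a simplified sketch of what the real function does):

void mutex_lock_io(struct mutex *lock)
{
	int token;

	token = io_schedule_prepare();	/* account the wait as iowait */
	mutex_lock(lock);
	io_schedule_finish(token);	/* restore the previous state */
}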
Fixes: f21860bac05b ("locking/mutex, sched/wait: Fix the mutex_lock_io_nested() define") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/878s6fshii.fsf@nanos.tec.linutronix.de --- include/linux/mutex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 0cd631a19727..515cff77a4f4 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -185,7 +185,7 @@ extern void mutex_lock_io(struct mutex *lock); # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) -# define mutex_lock_io_nested(lock, subclass) mutex_lock(lock) +# define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock) #endif /* -- cgit v1.2.3 From 39f985c8f667c80a3d1eb19d31138032fa36b09e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 20 Mar 2021 05:40:38 +0000 Subject: fs/cachefiles: Remove wait_bit_key layout dependency Cachefiles was relying on wait_page_key and wait_bit_key having the same layout, which is fragile. Now that wait_page_key is exposed in the pagemap.h header, we can remove that fragility. A comment on the need to maintain structure layout equivalence was added by Linus[1], and that comment is no longer applicable. Fixes: 62906027091f ("mm: add PageWaiters indicating tasks are waiting for a page bit") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: David Howells Tested-by: kafs-testing@auristor.com cc: linux-cachefs@redhat.com cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20210320054104.1300774-2-willy@infradead.org/ Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3510ca20ece0150af6b10c77a74ff1b5c198e3e2 [1] --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 20225b067583..8f4daac6eb4b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -559,7 +559,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, return pgoff; } -/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ struct wait_page_key { struct page *page; int bit_nr; -- cgit v1.2.3 From e5dbd33218bd8d87ab69f730ab90aed5fab7eb26 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 20 Mar 2021 05:40:39 +0000 Subject: mm/writeback: Add wait_on_page_writeback_killable This is the killable version of wait_on_page_writeback.
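A hypothetical caller would use it to abort rather than block uninterruptibly when a fatal signal arrives (a sketch; the -EINTR return convention is assumed from the _killable naming):

static int foo_wait_for_page(struct page *page)
{
	int ret;

	ret = wait_on_page_writeback_killable(page);
	if (ret < 0)
		return ret;	/* fatal signal: give up the wait */

	/* the page is no longer under writeback here */
	return 0;
}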
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: David Howells Tested-by: kafs-testing@auristor.com cc: linux-afs@lists.infradead.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20210320054104.1300774-3-willy@infradead.org --- include/linux/pagemap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8f4daac6eb4b..8c9947fd62f3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -682,6 +682,7 @@ static inline int wait_on_page_locked_killable(struct page *page) int put_and_wait_on_page_locked(struct page *page, int state); void wait_on_page_writeback(struct page *page); +int wait_on_page_writeback_killable(struct page *page); extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); -- cgit v1.2.3 From e43accba9b071dcd106b5e7643b1b106a158cbb1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 24 Mar 2021 21:43:32 +0200 Subject: psample: Fix user API breakage Cited commit added a new attribute before the existing group reference count attribute, thereby changing its value and breaking existing applications on new kernels. Before: # psample -l libpsample ERROR psample_group_foreach: failed to recv message: Operation not supported After: # psample -l Group Num Refcount Group Seq 1 1 0 Fix by restoring the value of the old attribute and removing the misleading comments from the enumerator to avoid future bugs. Cc: stable@vger.kernel.org Fixes: d8bed686ab96 ("net: psample: Add tunnel support") Signed-off-by: Ido Schimmel Reported-by: Adiel Bidani Reviewed-by: Jiri Pirko Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- include/uapi/linux/psample.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h index aea26ab1431c..bff5032c98df 100644 --- a/include/uapi/linux/psample.h +++ b/include/uapi/linux/psample.h @@ -3,7 +3,6 @@ #define __UAPI_PSAMPLE_H enum { - /* sampled packet metadata */ PSAMPLE_ATTR_IIFINDEX, PSAMPLE_ATTR_OIFINDEX, PSAMPLE_ATTR_ORIGSIZE, @@ -11,10 +10,8 @@ enum { PSAMPLE_ATTR_GROUP_SEQ, PSAMPLE_ATTR_SAMPLE_RATE, PSAMPLE_ATTR_DATA, - PSAMPLE_ATTR_TUNNEL, - - /* commands attributes */ PSAMPLE_ATTR_GROUP_REFCOUNT, + PSAMPLE_ATTR_TUNNEL, __PSAMPLE_ATTR_MAX }; -- cgit v1.2.3 From d85aecf2844ff02a0e5f077252b2461d4f10c9f0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Mar 2021 21:37:17 -0700 Subject: hugetlb_cgroup: fix imbalanced css_get and css_put pair for shared mappings The current implementation of hugetlb_cgroup for shared mappings could behave inconsistently. Consider the following two scenarios: 1.Assume initial css reference count of hugetlb_cgroup is 1: 1.1 Call hugetlb_reserve_pages with from = 1, to = 2. So css reference count is 2 associated with 1 file_region. 1.2 Call hugetlb_reserve_pages with from = 2, to = 3. So css reference count is 3 associated with 2 file_region. 1.3 coalesce_file_region will coalesce these two file_regions into one. So css reference count is 3 associated with 1 file_region now. 2.Assume initial css reference count of hugetlb_cgroup is 1 again: 2.1 Call hugetlb_reserve_pages with from = 1, to = 3. So css reference count is 2 associated with 1 file_region. Therefore, we might have one file_region while holding one or more css reference counts. This inconsistency could lead to an imbalanced css_get() and css_put() pair. If we do css_put one by one (e.g. the hole punch case), scenario 2 would put one more css reference. If we do css_put all together (e.g. the truncate case), scenario 1 will leak one css reference. The imbalanced css_get() and css_put() pair would result in a non-zero reference when we try to destroy the hugetlb cgroup. The hugetlb cgroup directory is removed __but__ the associated resource is not freed. This might ultimately result in OOM, or in being unable to create a new hugetlb cgroup in a busy workload. In order to fix this, we have to make sure that one file_region must hold exactly one css reference. So in the coalesce_file_region case, we should release one css reference before coalescence. Also only put the css reference when the entire file_region is removed. The last thing to note is that the caller of region_add() will only hold one reference to h_cg->css for the whole contiguous reservation region. But this area might be scattered when some file_regions already reside in it. As a result, many file_regions may share only one h_cg->css reference. In order to ensure that one file_region must hold exactly one css reference, we should do css_get() for each file_region and release the reference held by the caller when they are done. [linmiaohe@huawei.com: fix imbalanced css_get and css_put pair for shared mappings] Link: https://lkml.kernel.org/r/20210316023002.53921-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210301120540.37076-1-linmiaohe@huawei.com Fixes: 075a61d07a8e ("hugetlb_cgroup: add accounting for shared mappings") Reported-by: kernel test robot (auto build test ERROR) Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Aneesh Kumar K.V Cc: Wanpeng Li Cc: Mina Almasry Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 2ad6e92f124a..0bff345c4bc6 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -113,6 +113,11 @@ static inline bool hugetlb_cgroup_disabled(void) return !cgroup_subsys_enabled(hugetlb_cgrp_subsys); } +static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) +{ + css_put(&h_cg->css); +} + extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, @@ -138,7 +143,8 @@ extern void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, - unsigned long nr_pages); + unsigned long nr_pages, + bool region_del); extern void hugetlb_cgroup_file_init(void) __init; extern void hugetlb_cgroup_migrate(struct page *oldhpage, @@ -147,7 +153,8 @@ extern void hugetlb_cgroup_migrate(struct page *oldhpage, #else static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, - unsigned long nr_pages, + unsigned long nr_pages, + bool region_del) { } @@ -185,6 +192,10 @@ static inline bool hugetlb_cgroup_disabled(void) return true; } +static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) +{ +} + static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { -- cgit v1.2.3 From cf10bd4c4aff8dd64d1aa7f2a529d0c672bc16af Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Mar 2021 21:37:20 -0700 Subject: kasan: fix per-page
To allow performing tag checks on page_alloc addresses obtained via page_address(), tag-based KASAN modes store tags for page_alloc allocations in page->flags. Currently, the default tag value stored in page->flags is 0x00. Therefore, page_address() returns a 0x00ffff... address for pages that were not allocated via page_alloc. This might cause problems. A particular case we encountered is a conflict with KFENCE. If a KFENCE-allocated slab object is being freed via kfree(page_address(page) + offset), the address passed to kfree() will get tagged with 0x00 (as slab pages keep the default per-page tags). This leads to the is_kfence_address() check failing, and a KFENCE object ending up in a normal slab freelist, which causes memory corruption. This patch changes the way KASAN stores tags in page->flags: they are now stored xor'ed with 0xff. This way, KASAN doesn't need to initialize per-page flags for every created page, which might be slow. With this change, page_address() returns natively-tagged (with 0xff) pointers for pages that didn't have tags set explicitly. This patch fixes the encountered conflict with KFENCE and prevents more similar issues that can occur in the future. Link: https://lkml.kernel.org/r/1a41abb11c51b264511d9e71c303bb16d5cb367b.1615475452.git.andreyknvl@google.com Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc") Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Catalin Marinas Cc: Will Deacon Cc: Vincenzo Frascino Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Peter Collingbourne Cc: Evgenii Stepanov Cc: Branislav Rankov Cc: Kevin Brodsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 18 +++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 64a71bf20536..8ba434287387 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1461,16 +1461,28 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +/* + * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid + * setting tags for all pages to native kernel tag value 0xff, as the default + * value 0x00 maps to 0xff. + */ + static inline u8 page_kasan_tag(const struct page *page) { - if (kasan_enabled()) - return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; - return 0xff; + u8 tag = 0xff; + + if (kasan_enabled()) { + tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + tag ^= 0xff; + } + + return tag; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { if (kasan_enabled()) { + tag ^= 0xff; page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; } -- cgit v1.2.3 From c2655835fd8cabdfe7dab737253de3ffb88da126 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 24 Mar 2021 21:37:23 -0700 Subject: mm/mmu_notifiers: ensure range_end() is paired with range_start() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If one or more notifiers fail .invalidate_range_start(), invoke .invalidate_range_end() for "all" notifiers. If there are multiple notifiers, those that did not fail expect _start() and _end() to be paired; e.g. KVM's mmu_notifier_count would otherwise become imbalanced.
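(Aside, not from the patch: a minimal sketch of the pairing assumption, using a hypothetical example_ctx notifier and a counter modeled on KVM's mmu_notifier_count; locking is omitted for brevity.)

/* Hypothetical notifier: the counter only stays balanced if every
 * successful _start() is eventually paired with exactly one _end().
 */
#include <linux/kernel.h>
#include <linux/mmu_notifier.h>

struct example_ctx {
	struct mmu_notifier mn;
	long invalidate_count;
};

static int example_invalidate_range_start(struct mmu_notifier *mn,
					  const struct mmu_notifier_range *range)
{
	struct example_ctx *ctx = container_of(mn, struct example_ctx, mn);

	ctx->invalidate_count++;	/* must be balanced by _end() */
	return 0;
}

static void example_invalidate_range_end(struct mmu_notifier *mn,
					 const struct mmu_notifier_range *range)
{
	struct example_ctx *ctx = container_of(mn, struct example_ctx, mn);

	ctx->invalidate_count--;	/* goes negative if _start() was skipped */
}

static const struct mmu_notifier_ops example_ops = {
	.invalidate_range_start = example_invalidate_range_start,
	.invalidate_range_end = example_invalidate_range_end,
};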
Disallow notifiers that can fail _start() from implementing _end() so that it's unnecessary to track which notifiers rejected _start() or which had already succeeded prior to a failed _start(). Note, the existing behavior of calling _start() on all notifiers even after a previous notifier failed _start() was an unintended "feature". Make it canon now that the behavior is depended on for correctness. As of today, the bug is likely benign: 1. The only caller of the non-blocking notifier is OOM kill. 2. The only notifiers that can fail _start() are the i915 and Nouveau drivers. 3. The only notifiers that utilize _end() are the SGI UV GRU driver and KVM. 4. The GRU driver will never coincide with the i915/Nouveau drivers. 5. An imbalanced kvm->mmu_notifier_count only causes soft lockups in the _guest_, and the guest is already doomed due to being an OOM victim. Fix the bug now to play nice with future usage, e.g. KVM has a potential use case for blocking memslot updates while an invalidation is in progress, and failure to unblock would result in said updates being blocked indefinitely and hanging. Found by inspection. Verified by adding a second notifier in KVM that periodically returns -EAGAIN on non-blockable ranges, triggering OOM, and observing that KVM exits with an elevated notifier count. Link: https://lkml.kernel.org/r/20210311180057.1582638-1-seanjc@google.com Fixes: 93065ac753e4 ("mm, oom: distinguish blockable mode for mmu notifiers") Signed-off-by: Sean Christopherson Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Cc: David Rientjes Cc: Ben Gardon Cc: Michal Hocko Cc: "Jérôme Glisse" Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Dimitri Sivanich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b8200782dede..1a6a9eb6d3fa 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -169,11 +169,11 @@ struct mmu_notifier_ops { * the last refcount is dropped. * * If blockable argument is set to false then the callback cannot - * sleep and has to return with -EAGAIN. 0 should be returned - * otherwise. Please note that if invalidate_range_start approves - * a non-blocking behavior then the same applies to - * invalidate_range_end. - * + * sleep and has to return with -EAGAIN if sleeping would be required. + * 0 should be returned otherwise. Please note that notifiers that can + * fail invalidate_range_start are not allowed to implement + * invalidate_range_end, as there is no mechanism for informing the + * notifier that its start failed. */ int (*invalidate_range_start)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); -- cgit v1.2.3 From a024b7c2850dddd01e65b8270f0971deaf272f27 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 24 Mar 2021 21:37:50 -0700 Subject: mm: memblock: fix section mismatch warning again Commit 34dc2efb39a2 ("memblock: fix section mismatch warning") marked memblock_bottom_up() and memblock_set_bottom_up() as __init, but they could be referenced from non-init functions like memblock_find_in_range_node() on architectures that enable CONFIG_ARCH_KEEP_MEMBLOCK.
For such builds kernel test robot reports: WARNING: modpost: vmlinux.o(.text+0x74fea4): Section mismatch in reference from the function memblock_find_in_range_node() to the function .init.text:memblock_bottom_up() The function memblock_find_in_range_node() references the function __init memblock_bottom_up(). This is often because memblock_find_in_range_node lacks a __init annotation or the annotation of memblock_bottom_up is wrong. Replace the __init annotations with __init_memblock annotations so that the appropriate section will be selected depending on CONFIG_ARCH_KEEP_MEMBLOCK. Link: https://lore.kernel.org/lkml/202103160133.UzhgY0wt-lkp@intel.com Link: https://lkml.kernel.org/r/20210316171347.14084-1-rppt@kernel.org Fixes: 34dc2efb39a2 ("memblock: fix section mismatch warning") Signed-off-by: Mike Rapoport Reviewed-by: Arnd Bergmann Reported-by: kernel test robot Reviewed-by: David Hildenbrand Acked-by: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index d13e3cd938b4..5984fff3f175 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -460,7 +460,7 @@ static inline void memblock_free_late(phys_addr_t base, phys_addr_t size) /* * Set the allocation direction to bottom-up or top-down. */ -static inline __init void memblock_set_bottom_up(bool enable) +static inline __init_memblock void memblock_set_bottom_up(bool enable) { memblock.bottom_up = enable; } @@ -470,7 +470,7 @@ static inline __init void memblock_set_bottom_up(bool enable) * if this is true, that said, memblock will allocate memory * in bottom-up direction. */ -static inline __init bool memblock_bottom_up(void) +static inline __init_memblock bool memblock_bottom_up(void) { return memblock.bottom_up; } -- cgit v1.2.3 From 29d96eb261345c8d888e248ae79484e681be2faa Mon Sep 17 00:00:00 2001 From: Roja Rani Yarubandi Date: Wed, 24 Mar 2021 15:48:35 +0530 Subject: soc: qcom-geni-se: Cleanup the code to remove proxy votes This reverts commit 048eb908a1f2 ("soc: qcom-geni-se: Add interconnect support to fix earlycon crash"). The ICC core and platform drivers support the sync_state feature, which ensures that the default ICC BW votes from the bootloader are not removed until all of its consumers have probed. The proxy votes were needed in case the other QUP child drivers (I2C, SPI) probed before UART; they could turn off the QUP-CORE clock, which is a shared resource for all QUP drivers, causing unclocked access to the HW from earlycon. Given the above support from ICC, there is no longer a need to maintain proxy votes on the QUP-CORE ICC node from the QUP wrapper driver for the early console use case; the default votes won't be removed until the real console is probed.
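(Aside, not from this revert: a sketch of the ICC mechanism being relied on; the example-interconnect driver name and probe body are hypothetical. A provider that wires .sync_state to icc_sync_state() gets the behavior described above, where bootloader bandwidth votes persist until all consumers have probed.)

/* Hypothetical interconnect provider driver. With .sync_state set to
 * icc_sync_state(), the ICC framework keeps the boot-time bandwidth
 * votes until every registered consumer has probed, so no proxy votes
 * are needed for earlycon.
 */
#include <linux/interconnect-provider.h>
#include <linux/module.h>
#include <linux/platform_device.h>

static int example_icc_probe(struct platform_device *pdev)
{
	/* register an icc_provider and its nodes here */
	return 0;
}

static struct platform_driver example_icc_driver = {
	.probe = example_icc_probe,
	.driver = {
		.name = "example-interconnect",
		.sync_state = icc_sync_state,
	},
};
module_platform_driver(example_icc_driver);
MODULE_LICENSE("GPL");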
Cc: stable@vger.kernel.org Fixes: 266cd33b5913 ("interconnect: qcom: Ensure that the floor bandwidth value is enforced") Fixes: 7d3b0b0d8184 ("interconnect: qcom: Use icc_sync_state") Signed-off-by: Roja Rani Yarubandi Signed-off-by: Akash Asthana Reviewed-by: Matthias Kaehlcke Link: https://lore.kernel.org/r/20210324101836.25272-2-rojay@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- include/linux/qcom-geni-se.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h index ec2ad4b0fe14..c4fdb4463f7d 100644 --- a/include/linux/qcom-geni-se.h +++ b/include/linux/qcom-geni-se.h @@ -460,7 +460,5 @@ void geni_icc_set_tag(struct geni_se *se, u32 tag); int geni_icc_enable(struct geni_se *se); int geni_icc_disable(struct geni_se *se); - -void geni_remove_earlycon_icc_vote(void); #endif #endif -- cgit v1.2.3 From 1a1c130ab7575498eed5bcf7220037ae09cd1f8a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 23 Mar 2021 20:26:52 +0100 Subject: ACPI: tables: x86: Reserve memory occupied by ACPI tables The following problem has been reported by George Kennedy: Since commit 7fef431be9c9 ("mm/page_alloc: place pages to tail in __free_pages_core()") the following use-after-free occurs intermittently when ACPI tables are accessed. BUG: KASAN: use-after-free in ibft_init+0x134/0xc49 Read of size 4 at addr ffff8880be453004 by task swapper/0/1 CPU: 3 PID: 1 Comm: swapper/0 Not tainted 5.12.0-rc1-7a7fd0d #1 Call Trace: dump_stack+0xf6/0x158 print_address_description.constprop.9+0x41/0x60 kasan_report.cold.14+0x7b/0xd4 __asan_report_load_n_noabort+0xf/0x20 ibft_init+0x134/0xc49 do_one_initcall+0xc4/0x3e0 kernel_init_freeable+0x5af/0x66b kernel_init+0x16/0x1d0 ret_from_fork+0x22/0x30 ACPI tables mapped via kmap() do not have their mapped pages reserved, and those pages can be "stolen" by the buddy allocator. Apparently, on the affected system, the ACPI table in question is not located in "reserved" memory, like ACPI NVS or ACPI Data, which the buddy allocator will not use, so the memory occupied by that table has to be explicitly reserved to prevent the buddy allocator from using it. In order to address this problem, rearrange the initialization of the ACPI tables on x86 to locate the initial tables earlier and reserve the memory occupied by them. The other architectures using ACPI should not be affected by this change. Link: https://lore.kernel.org/linux-acpi/1614802160-29362-1-git-send-email-george.kennedy@oracle.com/ Reported-by: George Kennedy Tested-by: George Kennedy Signed-off-by: Rafael J. Wysocki
Reviewed-by: Mike Rapoport Cc: # 5.10+ --- include/linux/acpi.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fcdaab723916..3bdcfc4401b7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -222,10 +222,14 @@ void __iomem *__acpi_map_table(unsigned long phys, unsigned long size); void __acpi_unmap_table(void __iomem *map, unsigned long size); int early_acpi_boot_init(void); int acpi_boot_init (void); +void acpi_boot_table_prepare (void); void acpi_boot_table_init (void); int acpi_mps_check (void); int acpi_numa_init (void); +int acpi_locate_initial_tables (void); +void acpi_reserve_initial_tables (void); +void acpi_table_init_complete (void); int acpi_table_init (void); int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init acpi_table_parse_entries(char *id, unsigned long table_size, @@ -814,9 +818,12 @@ static inline int acpi_boot_init(void) return 0; } +static inline void acpi_boot_table_prepare(void) +{ +} + static inline void acpi_boot_table_init(void) { - return; } static inline int acpi_mps_check(void) -- cgit v1.2.3 From 9e67600ed6b8565da4b85698ec659b5879a6c1c6 Mon Sep 17 00:00:00 2001 From: Gulam Mohamed Date: Thu, 25 Mar 2021 09:32:48 +0000 Subject: scsi: iscsi: Fix race condition between login and sync thread A kernel panic was observed due to a timing issue between the sync thread and the initiator processing a login response from the target. The session reopen can be invoked both from the session sync thread when iscsid restarts and from iscsid through the error handler. Before the initiator receives the response to a login, another reopen request can be sent from the error handler/sync session. When the initial login response is subsequently processed, the connection has already been closed and the socket has been released. To fix this, a new connection state, ISCSI_CONN_BOUND, is added: - Set the connection state value to ISCSI_CONN_DOWN upon iscsi_if_ep_disconnect() and iscsi_if_stop_conn() - Set the connection state to the newly created value ISCSI_CONN_BOUND after the connection is bound (transport->bind_conn()) - In iscsi_set_param(), return -ENOTCONN if the connection state is neither ISCSI_CONN_BOUND nor ISCSI_CONN_UP Link: https://lore.kernel.org/r/20210325093248.284678-1-gulam.mohamed@oracle.com Reviewed-by: Mike Christie Signed-off-by: Gulam Mohamed Signed-off-by: Martin K. Petersen --- include/scsi/scsi_transport_iscsi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 8a26a2ffa952..fc5a39839b4b 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -193,6 +193,7 @@ enum iscsi_connection_state { ISCSI_CONN_UP = 0, ISCSI_CONN_DOWN, ISCSI_CONN_FAILED, + ISCSI_CONN_BOUND, }; struct iscsi_cls_conn { -- cgit v1.2.3 From a24f98176d1efae2c37d3438c57a624d530d9c33 Mon Sep 17 00:00:00 2001 From: Mikko Perttunen Date: Mon, 29 Mar 2021 16:38:27 +0300 Subject: gpu: host1x: Use different lock classes for each client To avoid false lockdep warnings, give each client lock a different lock class, passed in from the initialization site via a macro.
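(Aside, not from this patch: the general shape of the pattern the macro below relies on; the client/client_init names are hypothetical. A static lock_class_key declared inside the macro gives every expansion site its own lockdep class, so locks initialized from different call sites no longer share a class.)

/* Hypothetical example of the per-call-site lock class pattern. */
#include <linux/lockdep.h>
#include <linux/mutex.h>

struct client {
	struct mutex lock;
};

static inline void __client_init(struct client *c, struct lock_class_key *key)
{
	/* register the mutex under the class key of this call site */
	__mutex_init(&c->lock, "client->lock", key);
}

#define client_init(c)					\
	({						\
		static struct lock_class_key __key;	\
		__client_init(c, &__key);		\
	})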
Signed-off-by: Mikko Perttunen Signed-off-by: Thierry Reding --- include/linux/host1x.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index ce59a6a6a008..9eb77c87a83b 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -320,7 +320,14 @@ static inline struct host1x_device *to_host1x_device(struct device *dev) int host1x_device_init(struct host1x_device *device); int host1x_device_exit(struct host1x_device *device); -int host1x_client_register(struct host1x_client *client); +int __host1x_client_register(struct host1x_client *client, + struct lock_class_key *key); +#define host1x_client_register(class) \ + ({ \ + static struct lock_class_key __key; \ + __host1x_client_register(class, &__key); \ + }) + int host1x_client_unregister(struct host1x_client *client); int host1x_client_suspend(struct host1x_client *client); -- cgit v1.2.3 From df59d0a461bc5935232bf56a279e4d7a71c566a5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 30 Mar 2021 13:40:27 -0400 Subject: XArray: Add xa_limit_16b A 16-bit limit is a more common limit than I had realised. Make it generally available. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/xarray.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 92c0160b3352..a91e3d90df8a 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -229,9 +229,10 @@ static inline int xa_err(void *entry) * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. - * Two common ranges are predefined for you: + * Three common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] + * * xa_limit_16b - [0 - USHRT_MAX] */ struct xa_limit { u32 max; @@ -242,6 +243,7 @@ struct xa_limit { #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) +#define xa_limit_16b XA_LIMIT(0, USHRT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) -- cgit v1.2.3 From b9c6cdc37ee1fe5866d3b1c10efb9d03191a76af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Apr 2021 19:17:31 +0200 Subject: block: update a few comments in uapi/linux/blkpg.h The big comment at the top of the file talks about grand plans that never happened, so remove it to avoid confusing readers. Also mark the devname and volname fields as ignored, as they were never used by the kernel.
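(Aside, not from this patch: what the surviving "partition stuff" looks like from userspace; the add_partition() helper is hypothetical. The devname and volname fields can simply be left zeroed, since the kernel ignores them.)

/* Hypothetical userspace helper: add a partition to an open block
 * device via the BLKPG ioctl, using byte offsets.
 */
#include <sys/ioctl.h>
#include <linux/blkpg.h>

static int add_partition(int fd, int pno, long long start, long long length)
{
	struct blkpg_partition part = {
		.start = start,
		.length = length,
		.pno = pno,
		/* devname/volname left zeroed: unused / ignored */
	};
	struct blkpg_ioctl_arg arg = {
		.op = BLKPG_ADD_PARTITION,
		.datalen = sizeof(part),
		.data = &part,
	};

	return ioctl(fd, BLKPG, &arg);
}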
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/blkpg.h | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/blkpg.h b/include/uapi/linux/blkpg.h index ac6474e4f29d..d0a64ee97c6d 100644 --- a/include/uapi/linux/blkpg.h +++ b/include/uapi/linux/blkpg.h @@ -2,29 +2,6 @@ #ifndef _UAPI__LINUX_BLKPG_H #define _UAPI__LINUX_BLKPG_H -/* - * Partition table and disk geometry handling - * - * A single ioctl with lots of subfunctions: - * - * Device number stuff: - * get_whole_disk() (given the device number of a partition, - * find the device number of the encompassing disk) - * get_all_partitions() (given the device number of a disk, return the - * device numbers of all its known partitions) - * - * Partition stuff: - * add_partition() - * delete_partition() - * test_partition_in_use() (also for test_disk_in_use) - * - * Geometry stuff: - * get_geometry() - * set_geometry() - * get_bios_drivedata() - * - * For today, only the partition stuff - aeb, 990515 - */ #include <linux/compiler.h> #include <linux/ioctl.h> @@ -52,9 +29,8 @@ struct blkpg_partition { long long start; /* starting offset in bytes */ long long length; /* length in bytes */ int pno; /* partition number */ - char devname[BLKPG_DEVNAMELTH]; /* partition name, like sda5 or c0d1p2, - to be used in kernel messages */ - char volname[BLKPG_VOLNAMELTH]; /* volume label */ + char devname[BLKPG_DEVNAMELTH]; /* unused / ignored */ + char volname[BLKPG_VOLNAMELTH]; /* unused / ignored */ }; #endif /* _UAPI__LINUX_BLKPG_H */ -- cgit v1.2.3 From f06c609645ecd043c79380fac94145926603fb33 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Apr 2021 19:17:46 +0200 Subject: block: remove the unused RQF_ALLOCED flag Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc6bc8383b43..158aefae1030 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -85,8 +85,6 @@ typedef __u32 __bitwise req_flags_t; #define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) /* account into disk and partition IO statistics */ #define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) -/* request came from our alloc pool */ -#define RQF_ALLOCED ((__force req_flags_t)(1 << 14)) /* runtime pm request */ #define RQF_PM ((__force req_flags_t)(1 << 15)) /* on IO scheduler merge hash */ -- cgit v1.2.3
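(Aside, not from these patches: a sketch of the __force/__bitwise pattern visible in the req_flags_t hunk above; the example_flags_t type and EXF_* flags are hypothetical. The annotations let sparse flag any mixing of these values with plain integers.)

/* Hypothetical flag type following the req_flags_t pattern. */
#include <linux/types.h>

typedef unsigned int __bitwise example_flags_t;

#define EXF_STARTED	((__force example_flags_t)(1 << 0))
#define EXF_IO_STAT	((__force example_flags_t)(1 << 1))

static inline bool example_io_stat(example_flags_t flags)
{
	/* comparison against 0 is the one mix sparse permits */
	return (flags & EXF_IO_STAT) != 0;
}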