From 5f7b51bf09baca8e4f80cbe879536842bafb5f31 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@netfilter.org>
Date: Wed, 28 Jul 2021 17:01:15 +0200
Subject: netfilter: ipset: Limit the maximal range of consecutive elements to
 add/delete

The range size of consecutive elements were not limited. Thus one could
define a huge range which may result soft lockup errors due to the long
execution time. Now the range size is limited to 2^20 entries.

Reported-by: Brad Spengler <spender@grsecurity.net>
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 10279c4830ac..ada1296c87d5 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -196,6 +196,9 @@ struct ip_set_region {
 	u32 elements;		/* Number of elements vs timeout */
 };
 
+/* Max range where every element is added/deleted in one step */
+#define IPSET_MAX_RANGE		(1<<20)
+
 /* The max revision number supported by any set type + 1 */
 #define IPSET_REVISION_MAX	9
 
-- 
cgit v1.2.3


From 1027b96ec9d34f9abab69bc1a4dc5b1ad8ab1349 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 6 Aug 2021 16:21:24 +0800
Subject: once: Fix panic when module unload

DO_ONCE
DEFINE_STATIC_KEY_TRUE(___once_key);
__do_once_done
  once_disable_jump(once_key);
    INIT_WORK(&w->work, once_deferred);
    struct once_work *w;
    w->key = key;
    schedule_work(&w->work);                     module unload
                                                   //*the key is
destroy*
process_one_work
  once_deferred
    BUG_ON(!static_key_enabled(work->key));
       static_key_count((struct static_key *)x)    //*access key, crash*

When module uses DO_ONCE mechanism, it could crash due to the above
concurrency problem, we could reproduce it with link[1].

Fix it by add/put module refcount in the once work process.

[1] https://lore.kernel.org/netdev/eaa6c371-465e-57eb-6be9-f4b16b9d7cbf@huawei.com/

Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Reported-by: Minmin chen <chenmingmin@huawei.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/once.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/once.h b/include/linux/once.h
index 9225ee6d96c7..ae6f4eb41cbe 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -7,7 +7,7 @@
 
 bool __do_once_start(bool *done, unsigned long *flags);
 void __do_once_done(bool *done, struct static_key_true *once_key,
-		    unsigned long *flags);
+		    unsigned long *flags, struct module *mod);
 
 /* Call a function exactly once. The idea of DO_ONCE() is to perform
  * a function call such as initialization of random seeds, etc, only
@@ -46,7 +46,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key,
 			if (unlikely(___ret)) {				     \
 				func(__VA_ARGS__);			     \
 				__do_once_done(&___done, &___once_key,	     \
-					       &___flags);		     \
+					       &___flags, THIS_MODULE);	     \
 			}						     \
 		}							     \
 		___ret;							     \
-- 
cgit v1.2.3


From 71330842ff93ae67a066c1fa68d75672527312fa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 9 Aug 2021 21:45:32 +0200
Subject: bpf: Add _kernel suffix to internal lockdown_bpf_read

Rename LOCKDOWN_BPF_READ into LOCKDOWN_BPF_READ_KERNEL so we have naming
more consistent with a LOCKDOWN_BPF_WRITE_USER option that we are adding.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/security.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 24eda04221e9..724d7a4a0c91 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -123,7 +123,7 @@ enum lockdown_reason {
 	LOCKDOWN_INTEGRITY_MAX,
 	LOCKDOWN_KCORE,
 	LOCKDOWN_KPROBES,
-	LOCKDOWN_BPF_READ,
+	LOCKDOWN_BPF_READ_KERNEL,
 	LOCKDOWN_PERF,
 	LOCKDOWN_TRACEFS,
 	LOCKDOWN_XMON_RW,
-- 
cgit v1.2.3


From 563476ae0c5e48a028cbfa38fa9d2fc0418eb88f Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Sun, 11 Apr 2021 15:32:55 +0300
Subject: net/mlx5: Synchronize correct IRQ when destroying CQ

The CQ destroy is performed based on the IRQ number that is stored in
cq->irqn. That number wasn't set explicitly during CQ creation and as
expected some of the API users of mlx5_core_create_cq() forgot to update
it.

This caused to wrong synchronization call of the wrong IRQ with a number
0 instead of the real one.

As a fix, set the IRQ number directly in the mlx5_core_create_cq() and
update all users accordingly.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Fixes: ef1659ade359 ("IB/mlx5: Add DEVX support for CQ events")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1efe37466969..25a8be58d289 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1044,8 +1044,7 @@ void mlx5_unregister_debugfs(void);
 void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas);
 void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm);
 void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas);
-int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
-		    unsigned int *irqn);
+int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn);
 int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn);
 int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn);
 
-- 
cgit v1.2.3


From 51e1bb9eeaf7868db56e58f47848e364ab4c4129 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 9 Aug 2021 12:43:17 +0200
Subject: bpf: Add lockdown check for probe_write_user helper

Back then, commit 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper
to be called in tracers") added the bpf_probe_write_user() helper in order
to allow to override user space memory. Its original goal was to have a
facility to "debug, divert, and manipulate execution of semi-cooperative
processes" under CAP_SYS_ADMIN. Write to kernel was explicitly disallowed
since it would otherwise tamper with its integrity.

One use case was shown in cf9b1199de27 ("samples/bpf: Add test/example of
using bpf_probe_write_user bpf helper") where the program DNATs traffic
at the time of connect(2) syscall, meaning, it rewrites the arguments to
a syscall while they're still in userspace, and before the syscall has a
chance to copy the argument into kernel space. These days we have better
mechanisms in BPF for achieving the same (e.g. for load-balancers), but
without having to write to userspace memory.

Of course the bpf_probe_write_user() helper can also be used to abuse
many other things for both good or bad purpose. Outside of BPF, there is
a similar mechanism for ptrace(2) such as PTRACE_PEEK{TEXT,DATA} and
PTRACE_POKE{TEXT,DATA}, but would likely require some more effort.
Commit 96ae52279594 explicitly dedicated the helper for experimentation
purpose only. Thus, move the helper's availability behind a newly added
LOCKDOWN_BPF_WRITE_USER lockdown knob so that the helper is disabled under
the "integrity" mode. More fine-grained control can be implemented also
from LSM side with this change.

Fixes: 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/security.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 724d7a4a0c91..5b7288521300 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -120,6 +120,7 @@ enum lockdown_reason {
 	LOCKDOWN_MMIOTRACE,
 	LOCKDOWN_DEBUGFS,
 	LOCKDOWN_XMON_WR,
+	LOCKDOWN_BPF_WRITE_USER,
 	LOCKDOWN_INTEGRITY_MAX,
 	LOCKDOWN_KCORE,
 	LOCKDOWN_KPROBES,
-- 
cgit v1.2.3


From a2baf4e8bb0f306fbed7b5e6197c02896a638ab5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 9 Aug 2021 18:04:13 -0700
Subject: bpf: Fix potentially incorrect results with bpf_get_local_storage()

Commit b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage()
helper") fixed a bug for bpf_get_local_storage() helper so different tasks
won't mess up with each other's percpu local storage.

The percpu data contains 8 slots so it can hold up to 8 contexts (same or
different tasks), for 8 different program runs, at the same time. This in
general is sufficient. But our internal testing showed the following warning
multiple times:

  [...]
  warning: WARNING: CPU: 13 PID: 41661 at include/linux/bpf-cgroup.h:193
     __cgroup_bpf_run_filter_sock_ops+0x13e/0x180
  RIP: 0010:__cgroup_bpf_run_filter_sock_ops+0x13e/0x180
  <IRQ>
   tcp_call_bpf.constprop.99+0x93/0xc0
   tcp_conn_request+0x41e/0xa50
   ? tcp_rcv_state_process+0x203/0xe00
   tcp_rcv_state_process+0x203/0xe00
   ? sk_filter_trim_cap+0xbc/0x210
   ? tcp_v6_inbound_md5_hash.constprop.41+0x44/0x160
   tcp_v6_do_rcv+0x181/0x3e0
   tcp_v6_rcv+0xc65/0xcb0
   ip6_protocol_deliver_rcu+0xbd/0x450
   ip6_input_finish+0x11/0x20
   ip6_input+0xb5/0xc0
   ip6_sublist_rcv_finish+0x37/0x50
   ip6_sublist_rcv+0x1dc/0x270
   ipv6_list_rcv+0x113/0x140
   __netif_receive_skb_list_core+0x1a0/0x210
   netif_receive_skb_list_internal+0x186/0x2a0
   gro_normal_list.part.170+0x19/0x40
   napi_complete_done+0x65/0x150
   mlx5e_napi_poll+0x1ae/0x680
   __napi_poll+0x25/0x120
   net_rx_action+0x11e/0x280
   __do_softirq+0xbb/0x271
   irq_exit_rcu+0x97/0xa0
   common_interrupt+0x7f/0xa0
   </IRQ>
   asm_common_interrupt+0x1e/0x40
  RIP: 0010:bpf_prog_1835a9241238291a_tw_egress+0x5/0xbac
   ? __cgroup_bpf_run_filter_skb+0x378/0x4e0
   ? do_softirq+0x34/0x70
   ? ip6_finish_output2+0x266/0x590
   ? ip6_finish_output+0x66/0xa0
   ? ip6_output+0x6c/0x130
   ? ip6_xmit+0x279/0x550
   ? ip6_dst_check+0x61/0xd0
  [...]

Using drgn [0] to dump the percpu buffer contents showed that on this CPU
slot 0 is still available, but slots 1-7 are occupied and those tasks in
slots 1-7 mostly don't exist any more. So we might have issues in
bpf_cgroup_storage_unset().

Further debugging confirmed that there is a bug in bpf_cgroup_storage_unset().
Currently, it tries to unset "current" slot with searching from the start.
So the following sequence is possible:

  1. A task is running and claims slot 0
  2. Running BPF program is done, and it checked slot 0 has the "task"
     and ready to reset it to NULL (not yet).
  3. An interrupt happens, another BPF program runs and it claims slot 1
     with the *same* task.
  4. The unset() in interrupt context releases slot 0 since it matches "task".
  5. Interrupt is done, the task in process context reset slot 0.

At the end, slot 1 is not reset and the same process can continue to occupy
slots 2-7 and finally, when the above step 1-5 is repeated again, step 3 BPF
program won't be able to claim an empty slot and a warning will be issued.

To fix the issue, for unset() function, we should traverse from the last slot
to the first. This way, the above issue can be avoided.

The same reverse traversal should also be done in bpf_get_local_storage() helper
itself. Otherwise, incorrect local storage may be returned to BPF program.

  [0] https://github.com/osandov/drgn

Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210810010413.1976277-1-yhs@fb.com
---
 include/linux/bpf-cgroup.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8b77d08d4b47..6c9b10d82c80 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -201,8 +201,8 @@ static inline void bpf_cgroup_storage_unset(void)
 {
 	int i;
 
-	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
-		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+	for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
+		if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
 			continue;
 
 		this_cpu_write(bpf_cgroup_storage_info[i].task, NULL);
-- 
cgit v1.2.3


From b69dd5b3780a7298bd893816a09da751bc0636f7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 11 Aug 2021 12:57:15 -0700
Subject: net: igmp: increase size of mr_ifc_count

Some arches support cmpxchg() on 4-byte and 8-byte only.
Increase mr_ifc_count width to 32bit to fix this problem.

Fixes: 4a2b285e7e10 ("net: igmp: fix data-race in igmp_ifc_timer_expire()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20210811195715.3684218-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/inetdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 53aa0343bf69..aaf4f1b4c277 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -41,7 +41,7 @@ struct in_device {
 	unsigned long		mr_qri;		/* Query Response Interval */
 	unsigned char		mr_qrv;		/* Query Robustness Variable */
 	unsigned char		mr_gq_running;
-	unsigned char		mr_ifc_count;
+	u32			mr_ifc_count;
 	struct timer_list	mr_gq_timer;	/* general query timer */
 	struct timer_list	mr_ifc_timer;	/* interface change timer */
 
-- 
cgit v1.2.3