From 25954730461af01f66afa9e17036b051986b007e Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Thu, 6 Jul 2023 13:39:28 +0000
Subject: bpf: add percpu stats for bpf_map elements insertions/deletions

Add a generic percpu stats for bpf_map elements insertions/deletions in order
to keep track of both, the current (approximate) number of elements in a map
and per-cpu statistics on update/delete operations.

To expose these stats a particular map implementation should initialize the
counter and adjust it as needed using the 'bpf_map_*_elem_count' helpers
provided by this commit.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Link: https://lore.kernel.org/r/20230706133932.45883-2-aspsk@isovalent.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include/linux/bpf.h')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f58895830ada..360433f14496 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -275,6 +275,7 @@ struct bpf_map {
 	} owner;
 	bool bypass_spec_v1;
 	bool frozen; /* write-once; write-protected by freeze_mutex */
+	s64 __percpu *elem_count;
 };
 
 static inline const char *btf_field_type_name(enum btf_field_type type)
@@ -2040,6 +2041,35 @@ bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,
 }
 #endif
 
+static inline int
+bpf_map_init_elem_count(struct bpf_map *map)
+{
+	size_t size = sizeof(*map->elem_count), align = size;
+	gfp_t flags = GFP_USER | __GFP_NOWARN;
+
+	map->elem_count = bpf_map_alloc_percpu(map, size, align, flags);
+	if (!map->elem_count)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static inline void
+bpf_map_free_elem_count(struct bpf_map *map)
+{
+	free_percpu(map->elem_count);
+}
+
+static inline void bpf_map_inc_elem_count(struct bpf_map *map)
+{
+	this_cpu_inc(*map->elem_count);
+}
+
+static inline void bpf_map_dec_elem_count(struct bpf_map *map)
+{
+	this_cpu_dec(*map->elem_count);
+}
+
 extern int sysctl_unprivileged_bpf_disabled;
 
 static inline bool bpf_allow_ptr_leaks(void)
-- 
cgit v1.2.3


From 0a1f7bfe35a3e1302529fa900bf0574a5dfc8ea6 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 18 Jul 2023 01:38:09 -0700
Subject: bpf: Introduce internal definitions for UAPI-opaque
 bpf_{rb,list}_node

Structs bpf_rb_node and bpf_list_node are opaquely defined in
uapi/linux/bpf.h, as BPF program writers are not expected to touch their
fields - nor does the verifier allow them to do so.

Currently these structs are simple wrappers around structs rb_node and
list_head and linked_list / rbtree implementation just casts and passes
to library functions for those data structures. Later patches in this
series, though, will add an "owner" field to bpf_{rb,list}_node, such
that they're not just wrapping an underlying node type. Moreover, the
bpf linked_list and rbtree implementations will deal with these owner
pointers directly in a few different places.

To avoid having to do

  void *owner = (void*)bpf_list_node + sizeof(struct list_head)

with opaque UAPI node types, add bpf_{list,rb}_node_kern struct
definitions to internal headers and modify linked_list and rbtree to use
the internal types where appropriate.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230718083813.3416104-3-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux/bpf.h')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 360433f14496..511ed49c3fe9 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -228,6 +228,16 @@ struct btf_record {
 	struct btf_field fields[];
 };
 
+/* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */
+struct bpf_rb_node_kern {
+	struct rb_node rb_node;
+} __attribute__((aligned(8)));
+
+/* Non-opaque version of bpf_list_node in uapi/linux/bpf.h */
+struct bpf_list_node_kern {
+	struct list_head list_head;
+} __attribute__((aligned(8)));
+
 struct bpf_map {
 	/* The first two cachelines with read-mostly members of which some
 	 * are also accessed in fast-path (e.g. ops, max_entries).
-- 
cgit v1.2.3


From c3c510ce431cd99fa10dcd50d995c8e89330ee5b Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 18 Jul 2023 01:38:10 -0700
Subject: bpf: Add 'owner' field to bpf_{list,rb}_node

As described by Kumar in [0], in shared ownership scenarios it is
necessary to do runtime tracking of {rb,list} node ownership - and
synchronize updates using this ownership information - in order to
prevent races. This patch adds an 'owner' field to struct bpf_list_node
and bpf_rb_node to implement such runtime tracking.

The owner field is a void * that describes the ownership state of a
node. It can have the following values:

  NULL           - the node is not owned by any data structure
  BPF_PTR_POISON - the node is in the process of being added to a data
                   structure
  ptr_to_root    - the pointee is a data structure 'root'
                   (bpf_rb_root / bpf_list_head) which owns this node

The field is initially NULL (set by bpf_obj_init_field default behavior)
and transitions states in the following sequence:

  Insertion: NULL -> BPF_PTR_POISON -> ptr_to_root
  Removal:   ptr_to_root -> NULL

Before a node has been successfully inserted, it is not protected by any
root's lock, and therefore two programs can attempt to add the same node
to different roots simultaneously. For this reason the intermediate
BPF_PTR_POISON state is necessary. For removal, the node is protected
by some root's lock so this intermediate hop isn't necessary.

Note that bpf_list_pop_{front,back} helpers don't need to check owner
before removing as the node-to-be-removed is not passed in as input and
is instead taken directly from the list. Do the check anyways and
WARN_ON_ONCE in this unexpected scenario.

Selftest changes in this patch are entirely mechanical: some BTF
tests have hardcoded struct sizes for structs that contain
bpf_{list,rb}_node fields, those were adjusted to account for the new
sizes. Selftest additions to validate the owner field are added in a
further patch in the series.

  [0]: https://lore.kernel.org/bpf/d7hyspcow5wtjcmw4fugdgyp3fwhljwuscp3xyut5qnwivyeru@ysdq543otzv2

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230718083813.3416104-4-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/bpf.h')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 511ed49c3fe9..ceaa8c23287f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -231,11 +231,13 @@ struct btf_record {
 /* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */
 struct bpf_rb_node_kern {
 	struct rb_node rb_node;
+	void *owner;
 } __attribute__((aligned(8)));
 
 /* Non-opaque version of bpf_list_node in uapi/linux/bpf.h */
 struct bpf_list_node_kern {
 	struct list_head list_head;
+	void *owner;
 } __attribute__((aligned(8)));
 
 struct bpf_map {
-- 
cgit v1.2.3


From 6a5a148aaf14747570cc634f9cdfcb0393f5617f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 1 Aug 2023 13:13:58 +0200
Subject: bpf: fix bpf_probe_read_kernel prototype mismatch

bpf_probe_read_kernel() has a __weak definition in core.c and another
definition with an incompatible prototype in kernel/trace/bpf_trace.c,
when CONFIG_BPF_EVENTS is enabled.

Since the two are incompatible, there cannot be a shared declaration in
a header file, but the lack of a prototype causes a W=1 warning:

kernel/bpf/core.c:1638:12: error: no previous prototype for 'bpf_probe_read_kernel' [-Werror=missing-prototypes]

On 32-bit architectures, the local prototype

u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)

passes arguments in other registers as the one in bpf_trace.c

BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
            const void *, unsafe_ptr)

which uses 64-bit arguments in pairs of registers.

As both versions of the function are fairly simple and only really
differ in one line, just move them into a header file as an inline
function that does not add any overhead for the bpf_trace.c callers
and actually avoids a function call for the other one.

Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/ac25cb0f-b804-1649-3afb-1dc6138c2716@iogearbox.net/
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230801111449.185301-1-arnd@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux/bpf.h')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ceaa8c23287f..abe75063630b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2661,6 +2661,18 @@ static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
+static __always_inline int
+bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
+{
+	int ret = -EFAULT;
+
+	if (IS_ENABLED(CONFIG_BPF_EVENTS))
+		ret = copy_from_kernel_nofault(dst, unsafe_ptr, size);
+	if (unlikely(ret < 0))
+		memset(dst, 0, size);
+	return ret;
+}
+
 void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
 			  struct btf_mod_pair *used_btfs, u32 len);
 
-- 
cgit v1.2.3


From a3c485a5d8d47af5d2d1a0e5c3b7a1ed223669f9 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 7 Aug 2023 10:59:54 +0200
Subject: bpf: Add support for bpf_get_func_ip helper for uprobe program

Adding support for bpf_get_func_ip helper for uprobe program to return
probed address for both uprobe and return uprobe.

We discussed this in [1] and agreed that uprobe can have special use
of bpf_get_func_ip helper that differs from kprobe.

The kprobe bpf_get_func_ip returns:
  - address of the function if probe is attach on function entry
    for both kprobe and return kprobe
  - 0 if the probe is not attach on function entry

The uprobe bpf_get_func_ip returns:
  - address of the probe for both uprobe and return uprobe

The reason for this semantic change is that kernel can't really tell
if the probe user space address is function entry.

The uprobe program is actually kprobe type program attached as uprobe.
One of the consequences of this design is that uprobes do not have its
own set of helpers, but share them with kprobes.

As we need different functionality for bpf_get_func_ip helper for uprobe,
I'm adding the bool value to the bpf_trace_run_ctx, so the helper can
detect that it's executed in uprobe context and call specific code.

The is_uprobe bool is set as true in bpf_prog_run_array_sleepable, which
is currently used only for executing bpf programs in uprobe.

Renaming bpf_prog_run_array_sleepable to bpf_prog_run_array_uprobe
to address that it's only used for uprobes and that it sets the
run_ctx.is_uprobe as suggested by Yafang Shao.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
[1] https://lore.kernel.org/bpf/CAEf4BzZ=xLVkG5eurEuvLU79wAMtwho7ReR+XJAgwhFF4M-7Cg@mail.gmail.com/
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Viktor Malik <vmalik@redhat.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230807085956.2344866-2-jolsa@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux/bpf.h')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index abe75063630b..db3fe5a61b05 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1819,6 +1819,7 @@ struct bpf_cg_run_ctx {
 struct bpf_trace_run_ctx {
 	struct bpf_run_ctx run_ctx;
 	u64 bpf_cookie;
+	bool is_uprobe;
 };
 
 struct bpf_tramp_run_ctx {
@@ -1867,6 +1868,8 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
 	if (unlikely(!array))
 		return ret;
 
+	run_ctx.is_uprobe = false;
+
 	migrate_disable();
 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	item = &array->items[0];
@@ -1891,8 +1894,8 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
  * rcu-protected dynamically sized maps.
  */
 static __always_inline u32
-bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
-			     const void *ctx, bpf_prog_run_fn run_prog)
+bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu,
+			  const void *ctx, bpf_prog_run_fn run_prog)
 {
 	const struct bpf_prog_array_item *item;
 	const struct bpf_prog *prog;
@@ -1906,6 +1909,8 @@ bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
 	rcu_read_lock_trace();
 	migrate_disable();
 
+	run_ctx.is_uprobe = true;
+
 	array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
 	if (unlikely(!array))
 		goto out;
-- 
cgit v1.2.3


From 01b8539655635288dcd46366806abfacbb9b1f6c Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Wed, 9 Aug 2023 22:05:56 +0800
Subject: bpf: Remove unused declaration bpf_link_new_file()

Commit a3b80e107894 ("bpf: Allocate ID for bpf_link")
removed the implementation but not the declaration.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230809140556.45836-1-yuehaibing@huawei.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux/bpf.h')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index db3fe5a61b05..cfabbcf47bdb 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2120,7 +2120,6 @@ void bpf_link_cleanup(struct bpf_link_primer *primer);
 void bpf_link_inc(struct bpf_link *link);
 void bpf_link_put(struct bpf_link *link);
 int bpf_link_new_fd(struct bpf_link *link);
-struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
 struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
 
-- 
cgit v1.2.3


From bb48cf1679d294d4fd3bfaa88289ed9004cbb025 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Mon, 14 Aug 2023 13:59:08 -0500
Subject: bpf: Document struct bpf_struct_ops fields

Subsystems that want to implement a struct bpf_struct_ops structure to
enable struct_ops maps must currently reverse engineer how the structure
works. Given that this is meant to be a way for subsystem maintainers to
extend their subsystems using BPF, let's document it to make it a bit
easier on them.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230814185908.700553-3-void@manifault.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'include/linux/bpf.h')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cfabbcf47bdb..eced6400f778 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1550,6 +1550,53 @@ struct bpf_struct_ops_value;
 struct btf_member;
 
 #define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64
+/**
+ * struct bpf_struct_ops - A structure of callbacks allowing a subsystem to
+ *			   define a BPF_MAP_TYPE_STRUCT_OPS map type composed
+ *			   of BPF_PROG_TYPE_STRUCT_OPS progs.
+ * @verifier_ops: A structure of callbacks that are invoked by the verifier
+ *		  when determining whether the struct_ops progs in the
+ *		  struct_ops map are valid.
+ * @init: A callback that is invoked a single time, and before any other
+ *	  callback, to initialize the structure. A nonzero return value means
+ *	  the subsystem could not be initialized.
+ * @check_member: When defined, a callback invoked by the verifier to allow
+ *		  the subsystem to determine if an entry in the struct_ops map
+ *		  is valid. A nonzero return value means that the map is
+ *		  invalid and should be rejected by the verifier.
+ * @init_member: A callback that is invoked for each member of the struct_ops
+ *		 map to allow the subsystem to initialize the member. A nonzero
+ *		 value means the member could not be initialized. This callback
+ *		 is exclusive with the @type, @type_id, @value_type, and
+ *		 @value_id fields.
+ * @reg: A callback that is invoked when the struct_ops map has been
+ *	 initialized and is being attached to. Zero means the struct_ops map
+ *	 has been successfully registered and is live. A nonzero return value
+ *	 means the struct_ops map could not be registered.
+ * @unreg: A callback that is invoked when the struct_ops map should be
+ *	   unregistered.
+ * @update: A callback that is invoked when the live struct_ops map is being
+ *	    updated to contain new values. This callback is only invoked when
+ *	    the struct_ops map is loaded with BPF_F_LINK. If not defined, the
+ *	    it is assumed that the struct_ops map cannot be updated.
+ * @validate: A callback that is invoked after all of the members have been
+ *	      initialized. This callback should perform static checks on the
+ *	      map, meaning that it should either fail or succeed
+ *	      deterministically. A struct_ops map that has been validated may
+ *	      not necessarily succeed in being registered if the call to @reg
+ *	      fails. For example, a valid struct_ops map may be loaded, but
+ *	      then fail to be registered due to there being another active
+ *	      struct_ops map on the system in the subsystem already. For this
+ *	      reason, if this callback is not defined, the check is skipped as
+ *	      the struct_ops map will have final verification performed in
+ *	      @reg.
+ * @type: BTF type.
+ * @value_type: Value type.
+ * @name: The name of the struct bpf_struct_ops object.
+ * @func_models: Func models
+ * @type_id: BTF type id.
+ * @value_id: BTF value id.
+ */
 struct bpf_struct_ops {
 	const struct bpf_verifier_ops *verifier_ops;
 	int (*init)(struct btf *btf);
-- 
cgit v1.2.3


From 0816b8c6bf7fc87cec4273dc199e8f0764b9e7b1 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 21 Aug 2023 12:33:09 -0700
Subject: bpf: Consider non-owning refs to refcounted nodes RCU protected

An earlier patch in the series ensures that the underlying memory of
nodes with bpf_refcount - which can have multiple owners - is not reused
until RCU grace period has elapsed. This prevents
use-after-free with non-owning references that may point to
recently-freed memory. While RCU read lock is held, it's safe to
dereference such a non-owning ref, as by definition RCU GP couldn't have
elapsed and therefore underlying memory couldn't have been reused.

From the perspective of verifier "trustedness" non-owning refs to
refcounted nodes are now trusted only in RCU CS and therefore should no
longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them
MEM_RCU in order to reflect this new state.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230821193311.3290257-6-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux/bpf.h')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eced6400f778..12596af59c00 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -653,7 +653,8 @@ enum bpf_type_flag {
 	MEM_RCU			= BIT(13 + BPF_BASE_TYPE_BITS),
 
 	/* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
-	 * Currently only valid for linked-list and rbtree nodes.
+	 * Currently only valid for linked-list and rbtree nodes. If the nodes
+	 * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
 	 */
 	NON_OWN_REF		= BIT(14 + BPF_BASE_TYPE_BITS),
 
-- 
cgit v1.2.3