From 215bf4962f6c9605710012fad222a5fec001b3ad Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 8 Mar 2023 10:41:15 -0800
Subject: bpf: add iterator kfuncs registration and validation logic

Add the ability to register kfuncs that implement the BPF open-coded
iterator contract and enforce the naming and function proto convention.
Enforcement happens at the time of kfunc registration and significantly
simplifies the rest of the iterators logic in the verifier.

More details follow in subsequent patches, but we enforce the following
conditions.

All kfuncs (constructor, next, destructor) have to be named consistently
as bpf_iter_<type>_{new,next,destroy}(), respectively. <type> represents
iterator type, and iterator state should be represented as a matching
`struct bpf_iter_<type>` state type. Also, all iter kfuncs should have
a pointer to this `struct bpf_iter_<type>` as the very first argument.

Additionally:
  - Constructor, i.e., bpf_iter_<type>_new(), can have an arbitrary
    number of extra arguments. Return type is not enforced either.
  - Next method, i.e., bpf_iter_<type>_next(), has to return a pointer
    type and should have exactly one argument:
    `struct bpf_iter_<type> *` (const/volatile/restrict and typedefs
    are ignored).
  - Destructor, i.e., bpf_iter_<type>_destroy(), should return void and
    should have exactly one argument, similar to the next method.
  - struct bpf_iter_<type> size is enforced to be positive and
    a multiple of 8 bytes (to fit stack slots correctly).

Such strictness and consistency allow us to build generic helpers
abstracting important, but boilerplate, details to be able to use
open-coded iterators effectively and ergonomically (see bpf_for_each()
in subsequent patches). It also simplifies the verifier logic in some
places. At the same time, this doesn't hurt the generality of possible
iterator implementations. Win-win.

Constructor kfunc is marked with a new KF_ITER_NEW flag, next method is
marked with KF_ITER_NEXT (and should also have KF_RET_NULL, of course),
while destructor kfunc is marked as KF_ITER_DESTROY.

Additionally, we add a trivial kfunc name validation: it should be
a valid non-NULL and non-empty string.

Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20230308184121.1165081-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h | 2 ++
 include/linux/btf.h          | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 18538bad2b8c..e2dc7f064449 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -59,6 +59,8 @@ struct bpf_active_lock {
 	u32 id;
 };

+#define ITER_PREFIX "bpf_iter_"
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 556b3e2e7471..1bba0827e8c4 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -71,6 +71,10 @@
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
 #define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
+/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */
+#define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
+#define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
+#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */

 /*
  * Tag marking a kernel function as a kfunc.
  * This is meant to minimize the
-- cgit v1.2.3


From 06accc8779c1d558a5b5a21f2ac82b0c95827ddd Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 8 Mar 2023 10:41:16 -0800
Subject: bpf: add support for open-coded iterator loops

Teach the verifier about the concept of open-coded (or inline)
iterators.

This patch adds generic iterator loop verification logic, a new
STACK_ITER stack slot type to contain iterator state, and the necessary
kfunc plumbing for an iterator's constructor, destructor, and next
methods. The next patch implements the first specific iterator (numbers
iterator for implementing for() loop logic). Such a split allows us to
have more focused commits for verifier logic and a separate commit that
we can later point to as a demonstration of what it takes to add a new
kind of iterator.

Each kind of iterator has its own associated struct bpf_iter_<type>,
where <type> denotes a specific type of iterator. struct
bpf_iter_<type> state is supposed to live on the BPF program stack, so
there will be no way to change its size later on without breaking
backwards compatibility, so choose wisely! But given this struct is
specific to a given <type> of iterator, this allows a lot of
flexibility: simple iterators could be fine with just one stack slot
(8 bytes), like the numbers iterator in the next patch, while some
other more complicated iterators might need way more to keep their
iterator state. Either way, such a design allows us to avoid runtime
memory allocations, which would otherwise be necessary if we fixed the
on-the-stack size and it turned out to be too small for a given
iterator implementation.

The way the BPF verifier logic is implemented, there are no artificial
restrictions on the number of active iterators; it should work
correctly using multiple active iterators at the same time. This also
means you can have multiple nested iteration loops. A struct
bpf_iter_<type> reference can be safely passed to subprograms as well.

General flow is easiest to demonstrate with a simple example using the
numbers iterator implemented in the next patch. Here's the simplest
possible loop:

  struct bpf_iter_num it;
  int *v;

  bpf_iter_num_new(&it, 2, 5);
  while ((v = bpf_iter_num_next(&it))) {
      bpf_printk("X = %d", *v);
  }
  bpf_iter_num_destroy(&it);

The above snippet should output "X = 2", "X = 3", "X = 4". Note that 5
is exclusive and is not returned. This matches similar APIs (e.g.,
slices in Go or Rust) that implement a range of elements, where the end
index is non-inclusive.

In the above example, we see a trio of functions:
  - constructor, bpf_iter_num_new(), which initializes iterator state
    (struct bpf_iter_num it) on the stack. If any of the input arguments
    are invalid, the constructor should make sure to still initialize it
    such that subsequent bpf_iter_num_next() calls will return NULL.
    I.e., on error, return an error and construct an empty iterator.
  - next method, bpf_iter_num_next(), which accepts a pointer to
    iterator state and produces an element. The next method should
    always return a pointer. The contract with the BPF verifier is that
    the next method will always eventually return NULL when elements are
    exhausted. Once NULL is returned, subsequent next calls should keep
    returning NULL. In the case of the numbers iterator,
    bpf_iter_num_next() returns a pointer to an int (storage for this
    integer is inside the iterator state itself), which can be
    dereferenced after the corresponding NULL check.
  - once done with the iterator, it's mandated that the user cleans up
    its state with a call to the destructor, bpf_iter_num_destroy() in
    this case.
    The destructor frees up any resources and marks the stack space used
    by struct bpf_iter_num as usable for something else.

Any other iterator implementation will have to implement at least these
three methods. It is enforced that for any given type of iterator only
the applicable constructor/destructor/next are callable. I.e., the
verifier ensures you can't pass number iterator state into, say,
a cgroup iterator's next method.

It is important to keep the naming pattern consistent to be able to
create generic macros to help with BPF iter usability. E.g., one of the
follow-up patches adds a generic bpf_for_each() macro to bpf_misc.h in
selftests, which makes it possible to utilize the iterator "trio"
nicely without having to code the above somewhat tedious loop
explicitly every time. This is enforced at the kfunc registration point
by one of the previous patches in this series.

At the implementation level, iterator state tracking for verification
purposes is very similar to dynptr. We add a STACK_ITER stack slot
type, reserve the necessary number of slots, depending on
sizeof(struct bpf_iter_<type>), and keep track of the necessary extra
state in the "main" slot, which is marked with a non-zero ref_obj_id.
Other slots are also marked as STACK_ITER, but have zero ref_obj_id.
This is simpler than having a separate "is_first_slot" flag.

Another big distinction is that STACK_ITER is *always refcounted*,
which simplifies the implementation without sacrificing usability. So
there is no need for an extra "iter_id", no need to anticipate reuse of
STACK_ITER slots for new constructors, etc. Keeping it simple here.

As far as the verification logic goes, there are two extensive
comments, in process_iter_next_call() and iter_active_depths_differ(),
explaining some important and sometimes subtle aspects. Please refer to
them for details.

But from a 10,000-foot point of view, next methods are the points of
forking a verification state, which is conceptually similar to what the
verifier does when validating a conditional jump. We branch out at
a `call bpf_iter_<type>_next` instruction and simulate two outcomes:
NULL (iteration is done) and non-NULL (a new element is returned). NULL
is simulated first and is supposed to reach exit without looping. After
that, the non-NULL case is validated and it either reaches exit (for
trivial examples with no real loop), or reaches another
`call bpf_iter_<type>_next` instruction with a state equivalent to an
already (partially) validated one. State equivalency at that point
means we technically are going to be looping forever without "breaking
out" of the established "state envelope" (i.e., subsequent iterations
don't add any new knowledge or constraints to the verifier state, so
running 1, 2, 10, or a million of them doesn't matter). But taking into
account the contract stating that the iterator next method *has to*
return NULL eventually, we can conclude that the loop body is safe and
will eventually terminate. Given we validated the logic outside of the
loop (NULL case), and concluded that the loop body is safe (though
potentially looping many times), the verifier can claim safety of the
overall program logic.

The rest of the patch is the necessary plumbing for state tracking,
marking, and validation, plus further kfunc plumbing to allow
implementing iterator constructor, destructor, and next methods.
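To make the enforced contract concrete, a hypothetical iterator over a
fixed array might be declared and registered as in the following sketch
(the bpf_iter_arr type and its kfuncs are invented purely for
illustration and are not part of this series):

  /* state struct: positive size, multiple of 8 bytes, opaque to BPF progs */
  struct bpf_iter_arr {
          __u64 __opaque[2];
  } __attribute__((aligned(8)));

  /* trio following the bpf_iter_<type>_{new,next,destroy}() convention */
  __bpf_kfunc int bpf_iter_arr_new(struct bpf_iter_arr *it, int *arr, __u32 nr);
  __bpf_kfunc int *bpf_iter_arr_next(struct bpf_iter_arr *it);
  __bpf_kfunc void bpf_iter_arr_destroy(struct bpf_iter_arr *it);

  /* registration with the matching KF_ITER_* flags */
  BTF_ID_FLAGS(func, bpf_iter_arr_new, KF_ITER_NEW)
  BTF_ID_FLAGS(func, bpf_iter_arr_next, KF_ITER_NEXT | KF_RET_NULL)
  BTF_ID_FLAGS(func, bpf_iter_arr_destroy, KF_ITER_DESTROY)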
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20230308184121.1165081-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e2dc7f064449..0c052bc79940 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -61,6 +61,12 @@ struct bpf_active_lock {

 #define ITER_PREFIX "bpf_iter_"

+enum bpf_iter_state {
+	BPF_ITER_STATE_INVALID, /* for non-first slot */
+	BPF_ITER_STATE_ACTIVE,
+	BPF_ITER_STATE_DRAINED,
+};
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
@@ -105,6 +111,18 @@ struct bpf_reg_state {
 			bool first_slot;
 		} dynptr;

+		/* For bpf_iter stack slots */
+		struct {
+			/* BTF container and BTF type ID describing
+			 * struct bpf_iter_<type> of an iterator state
+			 */
+			struct btf *btf;
+			u32 btf_id;
+			/* packing following two fields to fit iter state into 16 bytes */
+			enum bpf_iter_state state:2;
+			int depth:30;
+		} iter;
+
 		/* Max size from any of the above. */
 		struct {
 			unsigned long raw1;
@@ -143,6 +161,8 @@ struct bpf_reg_state {
 	 * same reference to the socket, to determine proper reference freeing.
 	 * For stack slots that are dynptrs, this is used to track references to
 	 * the dynptr to determine proper reference freeing.
+	 * Similarly to dynptrs, we use ID to track "belonging" of a reference
+	 * to a specific instance of bpf_iter.
 	 */
 	u32 id;
 	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
@@ -213,9 +233,11 @@ enum bpf_stack_slot_type {
 	 * is stored in bpf_stack_state->spilled_ptr.dynptr.type
 	 */
 	STACK_DYNPTR,
+	STACK_ITER,
 };

 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
+
 #define BPF_DYNPTR_SIZE		sizeof(struct bpf_dynptr_kern)
 #define BPF_DYNPTR_NR_SLOTS		(BPF_DYNPTR_SIZE / BPF_REG_SIZE)

@@ -450,6 +472,7 @@ struct bpf_insn_aux_data {
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
+	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
 	u8 alu_state; /* used in combination with alu_limit */

 	/* below fields are initialized once */
-- cgit v1.2.3


From 6018e1f407cccf39b804d1f75ad4de7be4e6cc45 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 8 Mar 2023 10:41:17 -0800
Subject: bpf: implement numbers iterator

Implement the first open-coded iterator type over a range of integers.

Its public API consists of:
  - bpf_iter_num_new() constructor, which accepts a [start, end) range
    (that is, start is inclusive, end is exclusive).
  - bpf_iter_num_next(), which will keep returning a read-only pointer
    to int until the range is exhausted, at which point NULL will be
    returned. If bpf_iter_num_next() keeps being called after this,
    NULL will be persistently returned.
  - bpf_iter_num_destroy() destructor, which needs to be called at some
    point to clean up iterator state. The BPF verifier enforces that
    the iterator destructor is called at some point before the BPF
    program exits.

Note that `start = end = X` is a valid combination to set up an empty
iterator. bpf_iter_num_new() will return 0 (success) for any such
combination.

If bpf_iter_num_new() detects an invalid combination of input
arguments, it returns an error and resets the iterator state to,
effectively, an empty iterator, so any subsequent call to
bpf_iter_num_next() will keep returning NULL.
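A sketch of these semantics from the BPF program side (the -EINVAL
value for the invalid-arguments case is an assumption):

  struct bpf_iter_num it;
  int *v;

  /* start == end is valid: constructor succeeds, iterator is empty */
  bpf_iter_num_new(&it, 3, 3);
  v = bpf_iter_num_next(&it);     /* NULL right away */
  bpf_iter_num_destroy(&it);

  /* invalid range: constructor returns an error (e.g., -EINVAL) but
   * still constructs an empty iterator, so next safely returns NULL
   */
  if (bpf_iter_num_new(&it, 5, 2) < 0) { /* handle error if desired */ }
  while ((v = bpf_iter_num_next(&it))) { /* never entered */ }
  bpf_iter_num_destroy(&it);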
The BPF verifier has no knowledge that the returned integers are in the
[start, end) value range, as both `start` and `end` are not statically
known or enforced: they are runtime values.

While the implementation is pretty trivial, some care needs to be taken
to avoid overflows and underflows. Subsequent selftests will validate
correctness of [start, end) semantics, especially around extremes
(INT_MIN and INT_MAX).

Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can
be specified.

The bpf_iter_num_{new,next,destroy}() trio is a logical evolution from
bounded BPF loops and the bpf_loop() helper, and is the basis for
implementing ergonomic BPF loops with no statically known or verified
bounds. Subsequent patches implement the bpf_for() macro, demonstrating
how this can be wrapped into something that works and feels like
a normal for() loop in the C language.

Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20230308184121.1165081-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6792a7940e1e..e64ff1e89fb2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1617,8 +1617,12 @@ struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33

-/* Maximum number of loops for bpf_loop */
-#define BPF_MAX_LOOPS	BIT(23)
+/* Maximum number of loops for bpf_loop and bpf_iter_num.
+ * It's enum to expose it (and thus make it discoverable) through BTF.
+ */
+enum {
+	BPF_MAX_LOOPS = 8 * 1024 * 1024,
+};

 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
-- cgit v1.2.3


From 4b5ce570dbef57a20acdd71b0c65376009012354 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 9 Mar 2023 22:01:49 -0800
Subject: bpf: ensure state checkpointing at iter_next() call sites

The state equivalence check and checkpointing performed in
is_state_visited() employ certain heuristics to try to save memory by
avoiding state checkpoints if not enough jumps and instructions
happened since the last checkpoint. This leads to unpredictability of
whether a particular instruction will be checkpointed and how
regularly. While normally this does not cause many problems (except
inconveniences for predictable verifier tests, which we overcome with
the BPF_F_TEST_STATE_FREQ flag), it turns out that's not the case for
open-coded iterators.

Checking and saving state checkpoints at iter_next() call sites is
crucial for fast convergence of open-coded iterator loop logic, so we
need to force it. If we don't do that, is_state_visited() might skip
saving a checkpoint, causing an unnecessarily long sequence of
non-checkpointed instructions and jumps, leading to exhaustion of the
jump history buffer, and potentially other undesired outcomes. It is
expected that with correct open-coded iterators convergence will happen
quickly, so we don't run a risk of exhausting memory.

This patch adds, in addition to prune and jump instruction marks, also
a "forced checkpoint" mark, and makes sure that any iter_next() call
instruction is marked as such.
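A minimal sketch of how such a mark might be applied during the
verifier's instruction graph walk (the helper names follow the
description above but are assumptions about the implementation):

  /* when the insn walk reaches an iter_next() kfunc call, mark it as
   * both a prune point and a forced checkpoint, bypassing the usual
   * "enough insns/jumps since last checkpoint" heuristics
   */
  if (is_iter_next_insn(env, t)) {
          mark_prune_point(env, t);
          mark_force_checkpoint(env, t);
  }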
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20230310060149.625887-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0c052bc79940..81d525d057c7 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -477,8 +477,12 @@ struct bpf_insn_aux_data {

 	/* below fields are initialized once */
 	unsigned int orig_idx; /* original instruction index */
-	bool prune_point;
 	bool jmp_point;
+	bool prune_point;
+	/* ensure we check state equivalence and save state checkpoint at
+	 * this instruction, regardless of any heuristics
+	 */
+	bool force_checkpoint;
 };

 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
-- cgit v1.2.3


From 4cbd23cc92c49173e402753cab62b8a7754ed18f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Tue, 7 Mar 2023 22:59:20 -0800
Subject: bpf: Move a few bpf_local_storage functions to static scope

This patch moves bpf_local_storage_free_rcu() and
bpf_selem_unlink_map() to static scope because they are not used
outside of bpf_local_storage.c.

Signed-off-by: Martin KaFai Lau
Link: https://lore.kernel.org/r/20230308065936.1550103-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_local_storage.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index d934248b8e81..502ad7093f13 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -147,8 +147,6 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu);
 void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 			struct bpf_local_storage_elem *selem);

-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
-
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
 		bool charge_mem, gfp_t gfp_flags);
@@ -163,7 +161,6 @@ struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags, gfp_t gfp_flags);

-void bpf_local_storage_free_rcu(struct rcu_head *rcu);
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map);

 #endif /* _BPF_LOCAL_STORAGE_H */
-- cgit v1.2.3


From 2ffcb6fc50174d1efc8f98633eb2647d84483c68 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Tue, 7 Mar 2023 22:59:21 -0800
Subject: bpf: Refactor codes into bpf_local_storage_destroy

This patch first renames bpf_local_storage_unlink_nolock to
bpf_local_storage_destroy(). It better reflects that it is only used
when the storage's owner (sk/task/cgrp/inode) is being kfree()'d.

All of bpf_local_storage_destroy()'s callers take the spin lock and
then free the storage. This patch also moves these two steps into
bpf_local_storage_destroy().

This is preparation work for a later patch that uses
bpf_mem_cache_alloc/free in the bpf_local_storage.
Signed-off-by: Martin KaFai Lau
Link: https://lore.kernel.org/r/20230308065936.1550103-3-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_local_storage.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 502ad7093f13..5908a954ddc2 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -128,7 +128,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
 			 struct bpf_local_storage_map *smap,
 			 bool cacheit_lockit);

-bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage);
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage);

 void bpf_local_storage_map_free(struct bpf_map *map,
 				struct bpf_local_storage_cache *cache,
-- cgit v1.2.3


From fc6652aab6ad545de70b772550da9043d0b47f1c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Tue, 7 Mar 2023 22:59:24 -0800
Subject: bpf: Remember smap in bpf_local_storage

This patch remembers which smap triggered the allocation of a
'struct bpf_local_storage' object. The local_storage is allocated when
the very first selem is added to the owner. The smap pointer is needed
when using bpf_mem_cache_free in a later patch because it needs to free
to the correct smap's bpf_mem_alloc object.

When a selem is being removed, it needs to check if it is the selem
that triggered the creation of the local_storage. If it is, the
local_storage->smap pointer will be reset to NULL. This NULL reset is
done under the local_storage->lock in bpf_selem_unlink_storage_nolock()
when a selem is being removed. Also note that the local_storage may not
go away even if local_storage->smap is NULL, because there may be other
selems still stored in the local_storage.

Signed-off-by: Martin KaFai Lau
Link: https://lore.kernel.org/r/20230308065936.1550103-6-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_local_storage.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 5908a954ddc2..613b1805ed9f 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -83,6 +83,7 @@ struct bpf_local_storage_elem {

 struct bpf_local_storage {
 	struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
+	struct bpf_local_storage_map __rcu *smap;
 	struct hlist_head list; /* List of bpf_local_storage_elem */
 	void *owner;		/* The object that owns the above "list" of
 				 * bpf_local_storage_elem.
-- cgit v1.2.3


From a47eabf216f77cb6f22ceb38d46f1bb95968579c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Tue, 7 Mar 2023 22:59:25 -0800
Subject: bpf: Repurpose use_trace_rcu to reuse_now in bpf_local_storage

This patch re-purposes the use_trace_rcu argument to mean whether the
freed memory can be reused immediately or not. The use_trace_rcu is
renamed to reuse_now. Other than the boolean test being reversed, it
should be a no-op.

The following explains the reason for the rename and how it will be
used in a later patch.

In a later patch, bpf_mem_cache_alloc/free will be used in the
bpf_local_storage. The bpf mem allocator will reuse the freed memory
immediately. Some of the free paths in bpf_local_storage do not support
the memory being reused immediately. These paths are the "delete" elem
cases from the bpf_*_storage_delete() helper and the map_delete_elem()
syscall.
Note that "delete" elem before the owner's (sk/task/cgrp/inode) lifetime ended is not the common usage for the local storage. The common free path, bpf_local_storage_destroy(), can reuse the memory immediately. This common path means the storage stays with its owner until the owner is destroyed. The above mentioned "delete" elem paths that cannot reuse immediately always has the 'use_trace_rcu == true'. The cases that is safe for immediate reuse always have 'use_trace_rcu == false'. Instead of adding another arg in a later patch, this patch re-purpose this arg to reuse_now and have the test logic reversed. In a later patch, 'reuse_now == true' will free to the bpf_mem_cache_free() where the memory can be reused immediately. 'reuse_now == false' will go through the call_rcu_tasks_trace(). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 613b1805ed9f..18a31add2255 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -143,7 +143,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem); -- cgit v1.2.3 From c0d63f309186d8492577c67c67984c714b6b72bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:28 -0800 Subject: bpf: Add bpf_selem_free() This patch refactors the selem freeing logic into bpf_selem_free(). It is a preparation work for a later patch using bpf_mem_cache_alloc/free. The other kfree(selem) cases are also changed to bpf_selem_free(..., reuse_now = true). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-10-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 18a31add2255..a34f61467a2f 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -152,6 +152,10 @@ struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool charge_mem, gfp_t gfp_flags); +void bpf_selem_free(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map *smap, + bool reuse_now); + int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, -- cgit v1.2.3 From 74843b57ec70af7b67b7e6153374834ee18d139f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 9 Mar 2023 10:01:08 -0800 Subject: bpf: Change btf_record_find enum parameter to field_mask btf_record_find's 3rd parameter can be multiple enum btf_field_type's masked together. The function is called with BPF_KPTR in two places in verifier.c, so it works with masked values already. 
Signed-off-by: Dave Marchevsky
Link: https://lore.kernel.org/r/20230309180111.1618459-4-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e64ff1e89fb2..3a38db315f7f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1925,7 +1925,7 @@ void bpf_prog_free_id(struct bpf_prog *prog);
 void bpf_map_free_id(struct bpf_map *map);

 struct btf_field *btf_record_find(const struct btf_record *rec,
-				  u32 offset, enum btf_field_type type);
+				  u32 offset, u32 field_mask);
 void btf_record_free(struct btf_record *rec);
 void bpf_map_free_record(struct bpf_map *map);
 struct btf_record *btf_record_dup(const struct btf_record *rec);
-- cgit v1.2.3


From c8e18754091479fac3f5b6c053c6bc4be0b7fb11 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky
Date: Fri, 10 Mar 2023 15:07:41 -0800
Subject: bpf: Support __kptr to local kptrs

If a PTR_TO_BTF_ID type comes from program BTF - not vmlinux or module
BTF - it must have been allocated by bpf_obj_new and therefore must be
free'd with bpf_obj_drop. Such a PTR_TO_BTF_ID is considered a "local
kptr" and is tagged with MEM_ALLOC type tag by bpf_obj_new.

This patch adds support for treating __kptr-tagged pointers to "local
kptrs" as having an implicit bpf_obj_drop destructor for referenced
kptr acquire / release semantics. Consider the following example:

  struct node_data {
          long key;
          long data;
          struct bpf_rb_node node;
  };

  struct map_value {
          struct node_data __kptr *node;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __type(key, int);
          __type(value, struct map_value);
          __uint(max_entries, 1);
  } some_nodes SEC(".maps");

If struct node_data had a matching definition in kernel BTF, the
verifier would expect a destructor for the type to be registered. Since
struct node_data does not match any type in kernel BTF, the verifier
knows that there is no kfunc that provides a PTR_TO_BTF_ID to this
type, and that such a PTR_TO_BTF_ID can only come from bpf_obj_new. So
instead of searching for a registered dtor, a bpf_obj_drop dtor can be
assumed.

This allows the runtime to properly destruct such kptrs in
bpf_obj_free_fields, which enables maps to clean up map_vals w/ such
kptrs when going away.

Implementation notes:
  * "kernel_btf" variable is renamed to "kptr_btf" in btf_parse_kptr.
    Before this patch, the variable would only ever point to vmlinux or
    module BTFs, but now it can point to some program BTF for a local
    kptr type. It's later used to populate the (btf, btf_id) pair in the
    kptr btf field.
  * It's necessary to btf_get the program BTF when populating btf_field
    for a local kptr. btf_record_free later does a btf_put.
  * Behavior for non-local referenced kptrs is not modified, as the
    bpf_find_btf_id helper only searches vmlinux and module BTFs for
    a matching BTF type. If such a type is found, btf_field_kptr's btf
    will pass the btf_is_kernel check, and the associated release
    function is some one-argument dtor. If the btf_is_kernel check
    fails, the associated release function is the two-arg
    bpf_obj_drop_impl. Before this patch only btf_field_kptr's w/ kernel
    or module BTFs were created.
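Conceptually, the destructor selection at free time then looks like the
following sketch (simplified; the actual call sites plumb more state
around, and "rec" stands for the pointee type's btf_record):

  /* in bpf_obj_free_fields(), when dropping a stashed kptr */
  if (btf_is_kernel(field->kptr.btf))
          field->kptr.dtor(xchgd_field);            /* registered kfunc dtor */
  else
          field->kptr.obj_drop(xchgd_field, rec);   /* implicit bpf_obj_drop */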
Signed-off-by: Dave Marchevsky
Link: https://lore.kernel.org/r/20230310230743.2320707-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 11 ++++++++++-
 include/linux/btf.h |  2 --
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3a38db315f7f..756b85f0d0d3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -189,10 +189,19 @@ enum btf_field_type {
 		   BPF_RB_NODE | BPF_RB_ROOT,
 };

+typedef void (*btf_dtor_kfunc_t)(void *);
+typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *);
+
 struct btf_field_kptr {
 	struct btf *btf;
 	struct module *module;
-	btf_dtor_kfunc_t dtor;
+	union {
+		/* dtor used if btf_is_kernel(btf), otherwise the type
+		 * is program-allocated and obj_drop is used
+		 */
+		btf_dtor_kfunc_t dtor;
+		btf_dtor_obj_drop obj_drop;
+	};
 	u32 btf_id;
 };

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 1bba0827e8c4..d53b10cc55f2 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -121,8 +121,6 @@ struct btf_struct_metas {
 	struct btf_struct_meta types[];
 };

-typedef void (*btf_dtor_kfunc_t)(void *);
-
 extern const struct file_operations btf_fops;

 void btf_get(struct btf *btf);
-- cgit v1.2.3


From 22df776a9a866713d9decfb92b633bcfdb571954 Mon Sep 17 00:00:00 2001
From: David Vernet
Date: Wed, 15 Feb 2023 17:30:33 -0600
Subject: tasks: Extract rcu_users out of union

In commit 3fbd7ee285b2b ("tasks: Add a count of task RCU users"), a
count on the number of RCU users was added to struct task_struct. This
was done so as to enable the removal of task_rcu_dereference(), and
allow tasks to be protected by RCU even after exiting and being removed
from the runqueue. In this commit, the 'refcount_t rcu_users' field
that keeps track of this refcount was put into a union co-located with
'struct rcu_head rcu', so as to avoid taking up any extra space in
task_struct. This was possible to do safely, because the field was only
ever decremented by a static set of specific callers, and then never
incremented again.

While this restriction of there only being a small, static set of users
of this field has worked fine, it prevents us from leveraging the field
to use RCU to protect tasks in other contexts.

During tracing, for example, it would be useful to be able to collect
some tasks that performed a certain operation, put them in a map, and
then periodically summarize who they are, which cgroup they're in, how
much CPU time they've utilized, etc. While this can currently be done
with 'usage', it becomes tricky when a task is already in a map, or if
a reference should only be taken if a task is valid and will not soon
be reaped. Ideally, we could do something like pass a reference to a
map value, and then try to acquire a reference to the task in an RCU
read region by using refcount_inc_not_zero().

Similarly, in sched_ext, schedulers are using integer pids to remember
tasks, and then looking them up with find_task_by_pid_ns(). This is
slow, error prone, and adds complexity. It would be more convenient and
performant if BPF schedulers could instead store tasks directly in
maps, and then leverage RCU to ensure they can be safely accessed with
low overhead.

Finally, overloading fields like this is error prone. Someone that
wants to use 'rcu_users' could easily overlook the fact that once the
rcu callback is scheduled, the refcount will go back to being nonzero,
thus precluding the use of refcount_inc_not_zero().
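The acquire pattern this would enable might look like the following
sketch (the map-value plumbing and the 'task' field name are
hypothetical):

  struct task_struct *task;

  rcu_read_lock();
  task = READ_ONCE(map_val->task);        /* hypothetical stashed pointer */
  if (task && refcount_inc_not_zero(&task->rcu_users)) {
          /* task cannot be reaped until the matching
           * put_task_struct_rcu_user() is called
           */
  }
  rcu_read_unlock();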
Furthermore, as described below, it's possible to extract the fields of
the union without changing the size of task_struct.

There are several possible ways to enable this:

1. The lightest-touch approach is likely the one proposed in this
   patch, which is to simply extract 'rcu_users' and 'rcu' from the
   union, so that scheduling the 'rcu' callback doesn't overwrite the
   'rcu_users' refcount. If we have a trusted task pointer, this would
   allow us to use refcount_inc_not_zero() inside of an RCU region to
   determine if we can safely acquire a reference to the task and store
   it in a map. As mentioned below, this can be done without changing
   the size of task_struct, by moving the union to another location
   that has padding gaps we can fill in.

2. Removing 'refcount_t rcu_users', and instead having the entire task
   be freed in an rcu callback. This is likely the most sound overall
   design, though it changes the behavioral semantics exposed to
   callers, who currently expect that a task that's successfully looked
   up in e.g. the pid_list with find_task_by_pid_ns() can always have
   a 'usage' reference acquired on it, as it's guaranteed to be > 0
   until after the next gp. In order for this approach to work, we'd
   have to audit all callers. This approach also slightly changes
   behavior observed by user space by not invoking
   trace_sched_process_free() until the whole task_struct is actually
   being freed, rather than just after it's exited. It also may change
   timings, as memory will be freed in an RCU callback rather than
   immediately when the final 'usage' refcount drops to 0. This is also
   arguably a benefit, as it provides more predictable performance to
   callers who are refcounting tasks.

3. There may be other solutions as well that don't require changing the
   layout of task_struct. For example, we could possibly do something
   complex from the BPF side, such as listen for task exit and remove
   a task from a map when the task is exiting. This would likely
   require significant custom handling for task_struct in the verifier,
   so a more generalizable solution is likely warranted.

As mentioned above, this patch proposes the lightest-touch approach,
which allows callers elsewhere in the kernel to use 'rcu_users' to
ensure the lifetime of a task, by extracting 'rcu_users' and 'rcu' from
the union. There is no size change in task_struct with this patch.

Cc: Linus Torvalds
Cc: Eric W. Biederman
Cc: Peter Zijlstra
Cc: Ingo Molnar
Signed-off-by: David Vernet
Acked-by: Oleg Nesterov
Link: https://lore.kernel.org/r/20230215233033.889644-1-void@manifault.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/sched.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..b11b4517760f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1318,11 +1318,6 @@ struct task_struct {

 	struct tlbflush_unmap_batch	tlb_ubc;

-	union {
-		refcount_t		rcu_users;
-		struct rcu_head		rcu;
-	};
-
 	/* Cache last used pipe for splice(): */
 	struct pipe_inode_info		*splice_pipe;

@@ -1459,6 +1454,8 @@ struct task_struct {
 	unsigned long saved_state_change;
 # endif
 #endif
+	struct rcu_head			rcu;
+	refcount_t			rcu_users;
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
-- cgit v1.2.3


From 9e36a204bd43553a9cd4bd574612cd9a5df791ea Mon Sep 17 00:00:00 2001
From: Dave Marchevsky
Date: Mon, 13 Mar 2023 14:46:41 -0700
Subject: bpf: Disable migration when freeing stashed local kptr using obj drop

When a local kptr is stashed in a map and freed when the map goes away,
currently an error like the below appears:

  [   39.195695] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u32:15/2875
  [   39.196549] caller is bpf_mem_free+0x56/0xc0
  [   39.196958] CPU: 15 PID: 2875 Comm: kworker/u32:15 Tainted: G O 6.2.0-13016-g22df776a9a86 #4477
  [   39.197897] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
  [   39.198949] Workqueue: events_unbound bpf_map_free_deferred
  [   39.199470] Call Trace:
  [   39.199703]  <TASK>
  [   39.199911]  dump_stack_lvl+0x60/0x70
  [   39.200267]  check_preemption_disabled+0xbf/0xe0
  [   39.200704]  bpf_mem_free+0x56/0xc0
  [   39.201032]  ? bpf_obj_new_impl+0xa0/0xa0
  [   39.201430]  bpf_obj_free_fields+0x1cd/0x200
  [   39.201838]  array_map_free+0xad/0x220
  [   39.202193]  ? finish_task_switch+0xe5/0x3c0
  [   39.202614]  bpf_map_free_deferred+0xea/0x210
  [   39.203006]  ? lockdep_hardirqs_on_prepare+0xe/0x220
  [   39.203460]  process_one_work+0x64f/0xbe0
  [   39.203822]  ? pwq_dec_nr_in_flight+0x110/0x110
  [   39.204264]  ? do_raw_spin_lock+0x107/0x1c0
  [   39.204662]  ? lockdep_hardirqs_on_prepare+0xe/0x220
  [   39.205107]  worker_thread+0x74/0x7a0
  [   39.205451]  ? process_one_work+0xbe0/0xbe0
  [   39.205818]  kthread+0x171/0x1a0
  [   39.206111]  ? kthread_complete_and_exit+0x20/0x20
  [   39.206552]  ret_from_fork+0x1f/0x30
  [   39.206886]  </TASK>

This happens because the call to __bpf_obj_drop_impl I added in the
patch adding support for stashing local kptrs doesn't disable
migration. Prior to that patch, __bpf_obj_drop_impl logic only ran when
called by a BPF program, whereas now it can be called from the map free
path, so it's necessary to explicitly disable migration.

Also, refactor a bit to just call __bpf_obj_drop_impl directly instead
of bothering w/ dtor union and setting pointer-to-obj_drop.
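A sketch of the resulting fix in the map-free path (simplified from the
actual change; the variable names are assumptions):

  /* map teardown runs in sleepable process context, so pin the CPU
   * around the bpf_mem_free-based drop of the stashed object
   */
  migrate_disable();
  __bpf_obj_drop_impl(xchgd_field, meta ? meta->record : NULL);
  migrate_enable();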
Fixes: c8e187540914 ("bpf: Support __kptr to local kptrs")
Reported-by: Alexei Starovoitov
Signed-off-by: Dave Marchevsky
Link: https://lore.kernel.org/r/20230313214641.3731908-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 756b85f0d0d3..71cc92a4ba48 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -190,18 +190,14 @@ enum btf_field_type {
 };

 typedef void (*btf_dtor_kfunc_t)(void *);
-typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *);

 struct btf_field_kptr {
 	struct btf *btf;
 	struct module *module;
-	union {
-		/* dtor used if btf_is_kernel(btf), otherwise the type
-		 * is program-allocated and obj_drop is used
-		 */
-		btf_dtor_kfunc_t dtor;
-		btf_dtor_obj_drop obj_drop;
-	};
+	/* dtor used if btf_is_kernel(btf), otherwise the type is
+	 * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used
+	 */
+	btf_dtor_kfunc_t dtor;
 	u32 btf_id;
 };

-- cgit v1.2.3


From 2c854e5fcd7e243f5a7cf6a6afa0ef83060c903c Mon Sep 17 00:00:00 2001
From: Alexander Lobakin
Date: Mon, 13 Mar 2023 22:55:51 +0100
Subject: net: page_pool, skbuff: make skb_mark_for_recycle() always available

skb_mark_for_recycle() is guarded with CONFIG_PAGE_POOL, which creates
unneeded complication when using it in generic code. For now, it's only
used in the drivers always selecting Page Pool, so this works.

Move the guards so that the preprocessor will cut out only the
operation itself and the function will still be a noop on !PAGE_POOL
systems, but available there as well.

No functional changes.

Reported-by: kernel test robot
Link: https://lore.kernel.org/oe-kbuild-all/202303020342.Wi2PRFFH-lkp@intel.com
Signed-off-by: Alexander Lobakin
Link: https://lore.kernel.org/r/20230313215553.1045175-3-aleksander.lobakin@intel.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fe661011644b..3f3a2a82a86b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -5069,12 +5069,12 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
 #endif
 }

-#ifdef CONFIG_PAGE_POOL
 static inline void skb_mark_for_recycle(struct sk_buff *skb)
 {
+#ifdef CONFIG_PAGE_POOL
 	skb->pp_recycle = 1;
-}
 #endif
+}

 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
-- cgit v1.2.3


From 31bf1dbccfb0a9861d4846755096b3fff5687f8a Mon Sep 17 00:00:00 2001
From: Viktor Malik
Date: Fri, 10 Mar 2023 08:40:59 +0100
Subject: bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules

This resolves two problems with attachment of
fentry/fexit/fmod_ret/lsm to functions located in modules:

1. The verifier tries to find the address to attach to in kallsyms.
   This is always done by searching the entire kallsyms, not respecting
   the module in which the function is located. Such an approach causes
   an incorrect attachment address to be computed if the function to
   attach to is shadowed by a function of the same name located earlier
   in kallsyms.

2. If the address to attach to is located in a module, the module
   reference is only acquired in register_fentry. If the module is
   unloaded between the place where the address is found
   (bpf_check_attach_target in the verifier) and register_fentry, it is
   possible that another module is loaded to the same address, which
   may lead to potential errors.
Since the attachment must contain the BTF of the program to attach to,
we extract the module from it and search for the function address in
the correct module (resolving problem no. 1). Then, the module
reference is taken directly in bpf_check_attach_target and stored in
the bpf program (in bpf_prog_aux). The reference is only released when
the program is unloaded (resolving problem no. 2).

Signed-off-by: Viktor Malik
Acked-by: Jiri Olsa
Reviewed-by: Luis Chamberlain
Link: https://lore.kernel.org/r/3f6a9d8ae850532b5ef864ef16327b0f7a669063.1678432753.git.vmalik@redhat.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 71cc92a4ba48..3ef98fb92987 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1103,6 +1103,7 @@ struct bpf_trampoline {
 struct bpf_attach_target_info {
 	struct btf_func_model fmodel;
 	long tgt_addr;
+	struct module *tgt_mod;
 	const char *tgt_name;
 	const struct btf_type *tgt_type;
 };
@@ -1406,6 +1407,7 @@ struct bpf_prog_aux {
 	 * main prog always has linfo_idx == 0
 	 */
 	u32 linfo_idx;
+	struct module *mod;
 	u32 num_exentries;
 	struct exception_table_entry *extable;
 	union {
-- cgit v1.2.3


From bd5314f8dd2d41330eecb60f0490c3fcfe1fc99d Mon Sep 17 00:00:00 2001
From: Viktor Malik
Date: Fri, 17 Mar 2023 10:56:01 +0100
Subject: kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header

Moving find_kallsyms_symbol_value from kernel/module/internal.h to
include/linux/module.h. The reason is that internal.h is not prepared
to be included when CONFIG_MODULES=n. find_kallsyms_symbol_value is
used by kernel/bpf/verifier.c, and including internal.h from it
(without modules) leads to a compilation error:

  In file included from ../include/linux/container_of.h:5,
                   from ../include/linux/list.h:5,
                   from ../include/linux/timer.h:5,
                   from ../include/linux/workqueue.h:9,
                   from ../include/linux/bpf.h:10,
                   from ../include/linux/bpf-cgroup.h:5,
                   from ../kernel/bpf/verifier.c:7:
  ../kernel/bpf/../module/internal.h: In function 'mod_find':
  ../include/linux/container_of.h:20:54: error: invalid use of undefined type 'struct module'
     20 |         static_assert(__same_type(*(ptr), ((type *)0)->member) ||      \
        |                                                      ^~
  [...]

This patch fixes the above error.

Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules")
Reported-by: kernel test robot
Signed-off-by: Viktor Malik
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/oe-kbuild-all/202303161404.OrmfCy09-lkp@intel.com/
Link: https://lore.kernel.org/bpf/20230317095601.386738-1-vmalik@redhat.com
---
 include/linux/module.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 4435ad9439ab..41cfd3be57e5 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -616,6 +616,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 /* Look for this name: can be of form module:name. */
 unsigned long module_kallsyms_lookup_name(const char *name);

+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
+
 extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
 			long code);
 #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)
@@ -796,6 +798,12 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name)
 	return 0;
 }

+static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
+						       const char *name)
+{
+	return 0;
+}
+
 static inline int register_module_notifier(struct notifier_block *nb)
 {
 	/* no events will happen anyway, so this can always succeed */
-- cgit v1.2.3


From 04aae213e719ec2bb310158c4025316ace50589b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Mon, 20 Mar 2023 18:41:13 -0700
Subject: net: skbuff: rename __pkt_vlan_present_offset to __mono_tc_offset

vlan_present is gone since commit 354259fa73e2 ("net: remove
skb->vlan_present"), so rename the offset field to what BPF is
currently looking for in this byte: mono_delivery_time and
tc_at_ingress.

Signed-off-by: Jakub Kicinski
Link: https://lore.kernel.org/r/20230321014115.997841-2-kuba@kernel.org
Signed-off-by: Martin KaFai Lau
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3f3a2a82a86b..5a63878a4550 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -955,7 +955,7 @@ struct sk_buff {
 	__u8			csum_valid:1;

 	/* private: */
-	__u8			__pkt_vlan_present_offset[0];
+	__u8			__mono_tc_offset[0];
 	/* public: */
 	__u8			remcsum_offload:1;
 	__u8			csum_complete_sw:1;
@@ -1078,7 +1078,7 @@ struct sk_buff {
 #define TC_AT_INGRESS_MASK		(1 << 7)
 #define SKB_MONO_DELIVERY_TIME_MASK	(1 << 5)
 #endif
-#define PKT_VLAN_PRESENT_OFFSET	offsetof(struct sk_buff, __pkt_vlan_present_offset)
+#define SKB_BF_MONO_TC_OFFSET	offsetof(struct sk_buff, __mono_tc_offset)

 #ifdef __KERNEL__
 /*
-- cgit v1.2.3


From b94e032b7ad6318b36615f0e0cc3b0d61a5531e8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Mon, 20 Mar 2023 18:41:14 -0700
Subject: net: skbuff: reorder bytes 2 and 3 of the bitfield

BPF needs to know the offsets of fields it tries to access. Zero-length
fields are added to make offsetof() work. This unfortunately partitions
the bitfield (fields across the zero-length members can't be
coalesced).

Reorder bytes 2 and 3: BPF needs to know the offset of fields
previously in byte 3, and some fields in byte 2 should really be
optional. The two bytes are always in the same cacheline, so it should
not matter.

Signed-off-by: Jakub Kicinski
Link: https://lore.kernel.org/r/20230321014115.997841-3-kuba@kernel.org
Signed-off-by: Martin KaFai Lau
---
 include/linux/skbuff.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5a63878a4550..36d31e74db37 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -944,16 +944,6 @@ struct sk_buff {
 	__u8			ip_summed:2;
 	__u8			ooo_okay:1;

-	__u8			l4_hash:1;
-	__u8			sw_hash:1;
-	__u8			wifi_acked_valid:1;
-	__u8			wifi_acked:1;
-	__u8			no_fcs:1;
-	/* Indicates the inner headers are valid in the skbuff. */
-	__u8			encapsulation:1;
-	__u8			encap_hdr_csum:1;
-	__u8			csum_valid:1;
-
 	/* private: */
 	__u8			__mono_tc_offset[0];
 	/* public: */
@@ -966,6 +956,16 @@ struct sk_buff {
 	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;	/* See TC_AT_INGRESS_MASK */
 #endif
+
+	__u8			l4_hash:1;
+	__u8			sw_hash:1;
+	__u8			wifi_acked_valid:1;
+	__u8			wifi_acked:1;
+	__u8			no_fcs:1;
+	/* Indicates the inner headers are valid in the skbuff. */
+	__u8			encapsulation:1;
+	__u8			encap_hdr_csum:1;
+	__u8			csum_valid:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
-- cgit v1.2.3


From c0ba861117c3e8deb03855d7dc5a7717958bbb18 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Mon, 20 Mar 2023 18:41:15 -0700
Subject: net: skbuff: move the fields BPF cares about directly next to the offset marker

To avoid more possible BPF dependencies with moving bitfields around,
keep the fields BPF cares about right next to the offset marker.

Signed-off-by: Jakub Kicinski
Link: https://lore.kernel.org/r/20230321014115.997841-4-kuba@kernel.org
Signed-off-by: Martin KaFai Lau
---
 include/linux/skbuff.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 36d31e74db37..6aeb0e7b9511 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -947,15 +947,15 @@ struct sk_buff {
 	/* private: */
 	__u8			__mono_tc_offset[0];
 	/* public: */
-	__u8			remcsum_offload:1;
-	__u8			csum_complete_sw:1;
-	__u8			csum_level:2;
-	__u8			dst_pending_confirm:1;
 	__u8			mono_delivery_time:1;	/* See SKB_MONO_DELIVERY_TIME_MASK */
 #ifdef CONFIG_NET_CLS_ACT
-	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;	/* See TC_AT_INGRESS_MASK */
+	__u8			tc_skip_classify:1;
 #endif
+	__u8			remcsum_offload:1;
+	__u8			csum_complete_sw:1;
+	__u8			csum_level:2;
+	__u8			dst_pending_confirm:1;

 	__u8			l4_hash:1;
 	__u8			sw_hash:1;
@@ -1072,11 +1072,11 @@ struct sk_buff {
  * around, you also must adapt these constants.
  */
 #ifdef __BIG_ENDIAN_BITFIELD
-#define TC_AT_INGRESS_MASK		(1 << 0)
-#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 2)
+#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 7)
+#define TC_AT_INGRESS_MASK		(1 << 6)
 #else
-#define TC_AT_INGRESS_MASK		(1 << 7)
-#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 5)
+#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 0)
+#define TC_AT_INGRESS_MASK		(1 << 1)
 #endif
 #define SKB_BF_MONO_TC_OFFSET		offsetof(struct sk_buff, __mono_tc_offset)
-- cgit v1.2.3


From d7ba4cc900bf1eea2d8c807c6b1fc6bd61f41237 Mon Sep 17 00:00:00 2001
From: JP Kobryn
Date: Wed, 22 Mar 2023 12:47:54 -0700
Subject: bpf: return long from bpf_map_ops funcs

This patch changes the return types of bpf_map_ops functions to long,
where previously int was returned. Using long allows bpf programs to
maintain the sign bit in the absence of sign extension during
situations where inlined bpf helper funcs make calls to the
bpf_map_ops funcs and a negative error is returned.

The definitions of the helper funcs are generated from comments in the
bpf uapi header at `include/uapi/linux/bpf.h`. The return type of these
helpers was previously changed from int to long in commit bdb7b79b4ce8.
For any case where one of the map helpers calls the bpf_map_ops funcs
that still return a 32-bit int, a compiler might not include sign
extension instructions to properly convert the 32-bit negative value
to a 64-bit negative value.

For example:

bpf assembly excerpt of an inlined helper calling a kernel function and
checking for a specific error:

  ; err = bpf_map_update_elem(&mymap, &key, &val, BPF_NOEXIST);
  ...
  46: call 0xffffffffe103291c        ; htab_map_update_elem
  ; if (err && err != -EEXIST) {
  4b: cmp $0xffffffffffffffef,%rax   ; cmp -EEXIST,%rax

kernel function assembly excerpt of return value from
`htab_map_update_elem` returning 32-bit int:

  movl $0xffffffef, %r9d
  ...
  movl %r9d, %eax

...results in the comparison:

  cmp $0xffffffffffffffef, $0x00000000ffffffef

Fixes: bdb7b79b4ce8 ("bpf: Switch most helper return values from 32-bit int to 64-bit long")
Tested-by: Eduard Zingerman
Signed-off-by: JP Kobryn
Link: https://lore.kernel.org/r/20230322194754.185781-3-inwardvessel@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h    | 14 +++++++-------
 include/linux/filter.h |  6 +++---
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3ef98fb92987..ec0df059f562 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -96,11 +96,11 @@ struct bpf_map_ops {

 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
-	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
-	int (*map_delete_elem)(struct bpf_map *map, void *key);
-	int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
-	int (*map_pop_elem)(struct bpf_map *map, void *value);
-	int (*map_peek_elem)(struct bpf_map *map, void *value);
+	long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
+	long (*map_delete_elem)(struct bpf_map *map, void *key);
+	long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
+	long (*map_pop_elem)(struct bpf_map *map, void *value);
+	long (*map_peek_elem)(struct bpf_map *map, void *value);
 	void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu);

 	/* funcs called by prog_array and perf_event_array map */
@@ -139,7 +139,7 @@ struct bpf_map_ops {
 	struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);

 	/* Misc helpers.*/
-	int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);
+	long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);

 	/* map_meta_equal must be implemented for maps that can be
 	 * used as an inner map.  It is a runtime check to ensure
@@ -157,7 +157,7 @@ struct bpf_map_ops {
 	int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
 					      struct bpf_func_state *caller,
 					      struct bpf_func_state *callee);
-	int (*map_for_each_callback)(struct bpf_map *map,
+	long (*map_for_each_callback)(struct bpf_map *map,
 				     bpf_callback_t callback_fn,
 				     void *callback_ctx, u64 flags);

diff --git a/include/linux/filter.h b/include/linux/filter.h
index efa5d4a1677e..23c08c31bea9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1504,9 +1504,9 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */

-static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
-						  u64 flags, const u64 flag_mask,
-						  void *lookup_elem(struct bpf_map *map, u32 key))
+static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
+						   u64 flags, const u64 flag_mask,
+						   void *lookup_elem(struct bpf_map *map, u32 key))
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
-- cgit v1.2.3


From b671c2067a04c0668df174ff5dfdb573d1f9b074 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee
Date: Wed, 22 Mar 2023 20:23:58 -0700
Subject: bpf: Retire the struct_ops map kvalue->refcnt.
We have replaced kvalue->refcnt with synchronize_rcu() to wait for an
RCU grace period.

Maintenance of kvalue->refcnt was a complicated task, as we had to
simultaneously keep track of two reference counts: kvalue->refcnt
itself and the reference count of bpf_map. When kvalue->refcnt reaches
zero, we also have to reduce the reference count on bpf_map - yet these
steps are not performed in an atomic manner and require us to be
vigilant when managing them. By eliminating kvalue->refcnt, we can make
our maintenance more straightforward, as the refcount of bpf_map is now
the only one that needs to be managed!

To prevent the trampoline image of a struct_ops from being released
while it is still in use, we wait for an RCU grace period. The
setsockopt(TCP_CONGESTION, "...") command allows you to change your
socket's congestion control algorithm and can result in releasing the
old struct_ops implementation. That is fine. However, this function is
also exposed through bpf_setsockopt(), so it may be invoked by BPF
programs as well. To ensure that the trampoline image belonging to a
struct_ops can be safely called while its method is in use, the
trampoline safeguards the BPF program with rcu_read_lock(). Doing so
prevents any destruction of the associated images before returning from
a trampoline and requires us to wait for an RCU grace period.

Signed-off-by: Kui-Feng Lee
Link: https://lore.kernel.org/r/20230323032405.3735486-2-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ec0df059f562..f04098468d7a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1945,6 +1945,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
 void bpf_map_inc(struct bpf_map *map);
 void bpf_map_inc_with_uref(struct bpf_map *map);
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
 struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
-- cgit v1.2.3


From 68b04864ca425d1894c96b8141d4fba1181f11cb Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee
Date: Wed, 22 Mar 2023 20:24:00 -0700
Subject: bpf: Create links for BPF struct_ops maps.

Make bpf_link support struct_ops. Previously, struct_ops were always
used alone without any associated links. Upon updating its value, a
struct_ops would be activated automatically. Yet other BPF program
types were required to make a bpf_link with their instances before they
could become active. Now, however, you can create an inactive
struct_ops, and create a link to activate it later.

With bpf_links, struct_ops has a behavior similar to other BPF program
types. You can pin/unpin them from their links, and the struct_ops will
be deactivated when its link is removed, whereas previously someone had
to delete the value for it to be deactivated.

bpf_links are responsible for registering their associated struct_ops.
You can only use a struct_ops that has the BPF_F_LINK flag set to
create a bpf_link, while a struct_ops without this flag behaves in the
same manner as before and is registered upon updating its value.

The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it used
to craft the links for BPF struct_ops programs, but also to create
links for BPF struct_ops themselves. Since the links of BPF struct_ops
programs are only used to create trampolines internally, they are never
seen in other contexts.
To maintain a reference to the map backing this link, we add bpf_struct_ops_link as an additional type. The map pointer in it is RCU-protected and won't be needed until later in the patchset. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20230323032405.3735486-4-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f04098468d7a..8552279efe46 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1518,6 +1518,7 @@ struct bpf_struct_ops { void *kdata, const void *udata); int (*reg)(void *kdata); void (*unreg)(void *kdata); + int (*validate)(void *kdata); const struct btf_type *type; const struct btf_type *value_type; const char *name; @@ -1552,6 +1553,7 @@ static inline void bpf_module_put(const void *data, struct module *owner) else module_put(owner); } +int bpf_struct_ops_link_create(union bpf_attr *attr); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ @@ -1592,6 +1594,11 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, { return -EINVAL; } +static inline int bpf_struct_ops_link_create(union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + #endif #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) -- cgit v1.2.3 From aef56f2e918bf8fc8de25f0b36e8c2aba44116ec Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:24:02 -0700 Subject: bpf: Update the struct_ops of a bpf_link. Improve the BPF_LINK_UPDATE command of bpf() to allow conveniently switching between different struct_ops on a single bpf_link. This enables smoother transitions from one struct_ops to another. The struct_ops maps passed along with BPF_LINK_UPDATE must have the BPF_F_LINK flag. Signed-off-by: Kui-Feng Lee Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230323032405.3735486-6-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8552279efe46..2d8f3f639e68 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1476,6 +1476,8 @@ struct bpf_link_ops { void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); int (*fill_link_info)(const struct bpf_link *link, struct bpf_link_info *info); + int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, + struct bpf_map *old_map); }; struct bpf_tramp_link { @@ -1518,6 +1520,7 @@ struct bpf_struct_ops { void *kdata, const void *udata); int (*reg)(void *kdata); void (*unreg)(void *kdata); + int (*update)(void *kdata, void *old_kdata); int (*validate)(void *kdata); const struct btf_type *type; const struct btf_type *value_type; -- cgit v1.2.3 From e65a5c6edbc6ca4853e6076bd81db1a410592a09 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:42 -0700 Subject: bpf: Add a few bpf mem allocator functions This patch adds a few bpf mem allocator functions which will be used by bpf_local_storage in a later patch. bpf_mem_cache_alloc_flags(..., gfp_t flags) is added. When flags == GFP_KERNEL, it will fall back to __alloc(..., GFP_KERNEL). bpf_local_storage knows when its running context is sleepable (GFP_KERNEL), which provides a better guarantee on memory allocation. bpf_local_storage has some uncommon cases in which its selem cannot be reused immediately.
It handles its own rcu_head, goes through an rcu_trace gp, and then frees it. bpf_mem_cache_raw_free() is added for direct-free purposes, without leaking the internal LLIST_NODE_SZ knowledge. At free time, the 'struct bpf_mem_alloc *ma' is no longer available. However, the caller should know whether it is percpu memory or not, and it can call the corresponding raw_free function. bpf_local_storage does not support percpu values, so only the non-percpu bpf_mem_cache_raw_free() is added in this patch. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index a7104af61ab4..3929be5743f4 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -31,5 +31,7 @@ void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); /* kmem_cache_alloc/free equivalent: */ void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); +void bpf_mem_cache_raw_free(void *ptr); +void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags); #endif /* _BPF_MEM_ALLOC_H */ -- cgit v1.2.3 From 08a7ce384e33e53e0732c500a8af67a73f8fceca Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:43 -0700 Subject: bpf: Use bpf_mem_cache_alloc/free in bpf_local_storage_elem This patch uses bpf_mem_alloc for the task and cgroup local storage, where the bpf prog can easily get hold of the storage owner's PTR_TO_BTF_ID; e.g., bpf_get_current_task_btf() can be used in some of the kmalloc code paths, which would cause deadlock/recursion. bpf_mem_cache_alloc is deadlock free and will solve a legit use case in [1]. For sk storage, its batch-creation benchmark shows a few percent regression when the sk create/destroy batch size is larger than 32. The sk creation/destruction happens much more often and depends on external traffic. Considering it is hypothetical to be able to cause a deadlock with sk storage, it can cross that bridge and use bpf_mem_alloc once a legit (ie. useful) use case comes up. For inode storage, bpf_local_storage_destroy() is called before waiting for an rcu gp, and its memory cannot be reused immediately. inode stays with kmalloc/kfree after the rcu [or tasks_trace] gp. A 'bool bpf_ma' argument is added to bpf_local_storage_map_alloc(). Only task and cgroup storage have 'bpf_ma == true', which means to use bpf_mem_cache_alloc/free(). This patch only changes the selem to use bpf_mem_alloc for task and cgroup. The next patch will change the local_storage to use bpf_mem_alloc for task and cgroup as well. Here are some more details on the changes: * memory allocation: After bpf_mem_cache_alloc(), the SDATA(selem)->data is zeroed, because bpf_mem_cache_alloc() could return a reused selem. This keeps the existing bpf_map_kzalloc() behavior. Only SDATA(selem)->data is zeroed; it is the part visible to the bpf prog. There is no need to use zero_map_value() to do the zeroing, because bpf_selem_free(..., reuse_now = true) ensures no bpf prog is using the selem before returning the selem through bpf_mem_cache_free(). The internal fields of the selem will be initialized when it is linked to the new smap and the new local_storage. When 'bpf_ma == false', nothing changes in this patch; it stays with bpf_map_kzalloc(), as the sketch below illustrates.
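A rough sketch of the allocation path just described (simplified; the actual allocation code in kernel/bpf/bpf_local_storage.c differs in details such as memcg charging and flag handling):

    static struct bpf_local_storage_elem *
    selem_alloc_sketch(struct bpf_local_storage_map *smap, void *value,
                       gfp_t gfp_flags)
    {
        struct bpf_local_storage_elem *selem;

        if (smap->bpf_ma) {
            selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
            if (selem)
                /* could be reused memory: zero only the part the
                 * bpf prog can see; internal fields are set up when
                 * the selem is linked to smap/local_storage */
                memset(SDATA(selem)->data, 0, smap->map.value_size);
        } else {
            /* unchanged legacy path */
            selem = bpf_map_kzalloc(&smap->map, smap->elem_size, gfp_flags);
        }
        if (selem && value)
            copy_map_value(&smap->map, SDATA(selem)->data, value);
        return selem;
    }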
* memory free: The bpf_selem_free() and bpf_selem_free_rcu() are modified to handle the bpf_ma == true case. For the common selem free path, where its owner is also being destroyed, the memory is freed in bpf_local_storage_destroy(); the owner (task or cgroup) has already gone through an rcu gp, so the memory can be reused immediately. bpf_local_storage_destroy() will therefore call bpf_selem_free(..., reuse_now = true), which will do bpf_mem_cache_free() for immediate-reuse consideration. An exception is the delete-elem code path, which is called from the helper bpf_*_storage_delete() and the syscall bpf_map_delete_elem(). This path is an unusual case for local storage, because the common use case is to have the local storage stay for its owner's lifetime, so that the bpf prog and user space do not have to monitor the owner's destruction. For the delete-elem path, the selem cannot be reused immediately, because there could be a bpf prog still using it. It will call bpf_selem_free(..., reuse_now = false), which will wait for an rcu tasks trace gp before freeing the elem. The rcu callback is changed to do bpf_mem_cache_raw_free() instead of kfree(). When 'bpf_ma == false', it behaves the same as before; __bpf_selem_free() is added to do the kfree_rcu() and call_tasks_trace_rcu(). A few words on 'reuse_now == true': it is still racing with bpf_local_storage_map_free(), which runs under rcu protection, so it still needs to wait for an rcu gp instead of kfree()-ing directly. Otherwise, the selem may be reused by slab for a totally different struct while bpf_local_storage_map_free() is still using it (as an rcu reader). For the inode case, there may be other rcu readers as well. In short, bpf_ma == false and reuse_now == true => vanilla rcu. [1]: https://lore.kernel.org/bpf/20221118190109.1512674-1-namhyung@kernel.org/ Cc: Namhyung Kim Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a34f61467a2f..30efbcab2798 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 @@ -55,6 +56,8 @@ struct bpf_local_storage_map { u32 bucket_log; u16 elem_size; u16 cache_idx; + struct bpf_mem_alloc selem_ma; + bool bpf_ma; }; struct bpf_local_storage_data { @@ -122,7 +125,8 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, - struct bpf_local_storage_cache *cache); + struct bpf_local_storage_cache *cache, + bool bpf_ma); struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, -- cgit v1.2.3 From 6ae9d5e99e1dd26babdd9502759fa25a3fd348ad Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:44 -0700 Subject: bpf: Use bpf_mem_cache_alloc/free for bpf_local_storage This patch uses bpf_mem_cache_alloc/free for allocating and freeing bpf_local_storage for task and cgroup storage. The changes are similar to the previous patch. A few things worth mentioning for bpf_local_storage: The local_storage is freed when the last selem is deleted.
Before deleting a selem from local_storage, it needs to retrieve local_storage->smap, because bpf_selem_unlink_storage_nolock() may have set it to NULL. Note that local_storage->smap may already be NULL when the selem that created this local_storage has been removed. In this case, call_rcu will be used to free the local_storage. Also, the bpf_ma (true or false) value is needed before calling bpf_local_storage_free(). The bpf_ma can be obtained either from local_storage->smap (if available) or from any of its selems' smap. A new helper check_storage_bpf_ma() is added to obtain bpf_ma for a bpf_local_storage that is being deleted. When bpf_local_storage_alloc() gets reused memory, all fields either already hold correct values or will be initialized: 'cache[]' must already be all NULLs, 'list' must be empty, and the others will be initialized. Cc: Namhyung Kim Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 30efbcab2798..173ec7f43ed1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -57,6 +57,7 @@ struct bpf_local_storage_map { u16 elem_size; u16 cache_idx; struct bpf_mem_alloc selem_ma; + struct bpf_mem_alloc storage_ma; bool bpf_ma; }; -- cgit v1.2.3 From b7e852a9ec96635168c04204fb7cf1f7390b9a8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:23 -0700 Subject: bpf: Remove unused arguments from btf_struct_access(). Remove unused arguments from the btf_struct_access() callback. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-3-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 3 +-- include/linux/filter.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2d8f3f639e68..4f689dda748f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -893,8 +893,7 @@ struct bpf_verifier_ops { struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + int off, int size); }; struct bpf_prog_offload_ops { diff --git a/include/linux/filter.h b/include/linux/filter.h index 23c08c31bea9..5364b0c52c1d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -571,8 +571,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + int off, int size); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, -- cgit v1.2.3 From 63260df1396578226ac3134cf7f764690002e70e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:24 -0700 Subject: bpf: Refactor btf_nested_type_is_trusted(). btf_nested_type_is_trusted() tries to find a struct member at the corresponding offset. It works for flat structures but falls apart in more complex structs with nested structs, as the sketch below illustrates.
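A hypothetical example of where a flat, offset-based member search goes wrong (types invented purely for illustration):

    struct inner {
        struct task_struct *task;    /* offset 0 within inner */
    };

    struct outer {
        long pad;                    /* offset 0 */
        struct inner nested;         /* offset 8 */
    };

    /* An access at offset 8 into struct outer really refers to
     * nested.task, but a search over struct outer's own members only
     * finds the member "nested", not the "task" field that a
     * BTF_TYPE_SAFE*() annotation would name. Resolving it requires
     * recursing into struct inner. */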
The offset->member search is already performed by btf_struct_walk(), including for nested structs. Reuse this work and pass {field name, field btf id} into btf_nested_type_is_trusted() instead of an offset, to make the BTF_TYPE_SAFE*() logic more robust. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f689dda748f..002a811b6b90 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2263,7 +2263,7 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id, @@ -2302,7 +2302,7 @@ struct bpf_core_ctx { bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, const char *suffix); + const char *field_name, u32 btf_id, const char *suffix); bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, @@ -2517,7 +2517,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) static inline int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag) + u32 *next_btf_id, enum bpf_type_flag *flag, + const char **field_name) { return -EACCES; } -- cgit v1.2.3 From 34bf93472f8fb60b4189aa2872471017e739cf0a Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Thu, 30 Mar 2023 12:20:01 +0200 Subject: kallsyms: move module-related functions under correct configs Functions for searching module kallsyms should have non-empty definitions only if CONFIG_MODULES=y and CONFIG_KALLSYMS=y. Until now, only the CONFIG_MODULES check was used for many of these, which may have caused compilation errors on some configs. This patch moves all relevant functions under the correct configs. Fixes: bd5314f8dd2d ("kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header") Signed-off-by: Viktor Malik Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202303181535.RFDCnz3E-lkp@intel.com/ Link: https://lore.kernel.org/r/20230330102001.2183693-1-vmalik@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/module.h | 135 +++++++++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 41cfd3be57e5..886d24877c7c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -608,16 +608,6 @@ static inline bool within_module(unsigned long addr, const struct module *mod) /* Search for module by name: must be in a RCU-sched critical section. */ struct module *find_module(const char *name); -/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if symnum out of range. */ -int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *name, char *module_name, int *exported); - -/* Look for this name: can be of form module:name.
*/ -unsigned long module_kallsyms_lookup_name(const char *name); - -unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); - extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) @@ -664,17 +654,6 @@ static inline void __module_get(struct module *module) /* Dereference module function descriptor */ void *dereference_module_function_descriptor(struct module *mod, void *ptr); -/* For kallsyms to ask for address resolution. namebuf should be at - * least KSYM_NAME_LEN long: a pointer to namebuf is returned if - * found, otherwise NULL. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, const unsigned char **modbuildid, - char *namebuf); -int lookup_module_symbol_name(unsigned long addr, char *symname); -int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); - int register_module_notifier(struct notifier_block *nb); int unregister_module_notifier(struct notifier_block *nb); @@ -765,45 +744,6 @@ static inline void module_put(struct module *module) #define module_name(mod) "kernel" -/* For kallsyms to ask for address resolution. NULL means not found. */ -static inline const char *module_address_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, - const unsigned char **modbuildid, - char *namebuf) -{ - return NULL; -} - -static inline int lookup_module_symbol_name(unsigned long addr, char *symname) -{ - return -ERANGE; -} - -static inline int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name) -{ - return -ERANGE; -} - -static inline int module_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name, - char *module_name, int *exported) -{ - return -ERANGE; -} - -static inline unsigned long module_kallsyms_lookup_name(const char *name) -{ - return 0; -} - -static inline unsigned long find_kallsyms_symbol_value(struct module *mod, - const char *name) -{ - return 0; -} - static inline int register_module_notifier(struct notifier_block *nb) { /* no events will happen anyway, so this can always succeed */ @@ -899,7 +839,36 @@ int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, struct module *, unsigned long), void *data); -#else + +/* For kallsyms to ask for address resolution. namebuf should be at + * least KSYM_NAME_LEN long: a pointer to namebuf is returned if + * found, otherwise NULL. + */ +const char *module_address_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, const unsigned char **modbuildid, + char *namebuf); +int lookup_module_symbol_name(unsigned long addr, char *symname); +int lookup_module_symbol_attrs(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char *modname, + char *name); + +/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if + * symnum out of range. + */ +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported); + +/* Look for this name: can be of form module:name. 
*/ +unsigned long module_kallsyms_lookup_name(const char *name); + +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); + +#else /* CONFIG_MODULES && CONFIG_KALLSYMS */ + static inline int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, struct module *, unsigned long), void *data) { return -EOPNOTSUPP; } + +/* For kallsyms to ask for address resolution. NULL means not found. */ +static inline const char *module_address_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, + const unsigned char **modbuildid, + char *namebuf) +{ + return NULL; +} + +static inline int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + return -ERANGE; +} + +static inline int lookup_module_symbol_attrs(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char *modname, + char *name) +{ + return -ERANGE; +} + +static inline int module_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported) +{ + return -ERANGE; +} + +static inline unsigned long module_kallsyms_lookup_name(const char *name) +{ + return 0; +} + +static inline unsigned long find_kallsyms_symbol_value(struct module *mod, + const char *name) +{ + return 0; +} + #endif /* CONFIG_MODULES && CONFIG_KALLSYMS */ #endif /* _LINUX_MODULE_H */ -- cgit v1.2.3 From 4294a0a7ab6282c3d92f03de84e762dda993c93d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:47 -0700 Subject: bpf: Split off basic BPF verifier log into separate file The kernel/bpf/verifier.c file is large and growing larger all the time. So it's good to start splitting off more or less self-contained parts into separate files to keep source code size (somewhat) under control. This patch is one step in this direction, moving some of the BPF verifier log routines into a separate kernel/bpf/log.c. Right now it's the most low-level and isolated routines for appending data to the log, resetting the log to a previous position, etc. Eventually we could probably move verifier state printing logic here as well, but this patch doesn't attempt to do that yet. Subsequent patches will add more logic to verifier log management, so having the basics in a separate file will make sure verifier.c doesn't grow more with new changes.
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-2-andrii@kernel.org --- include/linux/bpf_verifier.h | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 81d525d057c7..83dff25545ee 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -498,11 +498,6 @@ struct bpf_verifier_log { u32 len_total; }; -static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) -{ - return log->len_used >= log->len_total - 1; -} - #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 @@ -512,6 +507,11 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U +static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) +{ + return log->len_used >= log->len_total - 1; +} + static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && @@ -519,13 +519,6 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) log->level == BPF_LOG_KERNEL); } -static inline bool -bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) -{ - return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 && - log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); -} - #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_info { @@ -608,12 +601,14 @@ struct bpf_verifier_env { char type_str_buf[TYPE_STR_BUF_LEN]; }; +bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log); __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); +void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3 From 1216640938035e63bdbd32438e91c9bcc1fd8ee1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:49 -0700 Subject: bpf: Switch BPF verifier log to be a rotating log by default Currently, if the user-supplied log buffer for collecting the BPF verifier log turns out to be too small to contain the full log, the bpf() syscall returns -ENOSPC, fails BPF program verification/load, and preserves the first N-1 bytes of the verifier log (where N is the size of the user-supplied buffer). This is problematic in a bunch of common scenarios, especially when working with real-world BPF programs that tend to be pretty complex as far as verification goes and require big log buffers. Typically, it's when debugging tricky cases at log level 2 (verbose). Also, when a BPF program is successfully validated, log level 2 is the only way to actually see verifier state progression and all the important details. Even with log level 1, it's possible to get -ENOSPC even if the final verifier log fits in the log buffer, if there is a code path that's deep enough to fill up the entire log, even if normally it would be reset later on (there is logic to chop off successfully validated portions of the BPF verifier log). In short, it's not always possible to pre-size the log buffer.
Also, what's worse, in practice the end of the log is most often way more important than the beginning, but the verifier stops emitting the log as soon as the initial log buffer is filled up. This patch switches the BPF verifier log to effectively behave as a rotating log. That is, if the user-supplied log buffer turns out to be too short, the verifier will keep overwriting previously written log, effectively treating the user's log buffer as a ring buffer. -ENOSPC is still going to be returned at the end, to notify the user that the log contents were truncated, but the important last N bytes of the log are returned, which might be all that the user really needs. This consistent -ENOSPC behavior, regardless of rotating or fixed log mode, allows us to prevent backwards compatibility breakage. The only user-visible change is which portion of the verifier log the user ends up seeing *if the buffer is too small*. Given that the contents of the verifier log itself are not an ABI, there is no breakage due to this behavior change. Specialized tools that rely on specific contents of the verifier log in the -ENOSPC scenario are expected to be easily adapted to accommodate the old and new behaviors. Importantly, though, to preserve a good user experience and not require every user-space application to adapt to this new behavior, before exiting to user-space the verifier will rotate the log (in place) to make it start at the very beginning of the user buffer as a continuous zero-terminated string. The contents will be the last N-1 bytes of the full verifier log, chopped off at the front, of course. Given that the beginning of the log is sometimes important as well, we add a BPF_LOG_FIXED (which equals 8) flag to force the old behavior, which allows tools like veristat to request the first part of the verifier log, if necessary. The BPF_LOG_FIXED flag is also a simple and straightforward way to check whether the BPF verifier supports the rotating behavior. On the implementation side, conceptually, it's all simple. We maintain 64-bit logical start and end positions. If we need to truncate the log, the start position is adjusted accordingly to lag the end position by N bytes. We then use those logical positions to calculate their matching actual positions in the user buffer and handle wrap-around at the end of the buffer properly. Finally, right before returning from bpf_check(), we rotate the user log buffer contents in place as necessary, to make the log contents contiguous. See comments in the relevant functions for details. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-4-andrii@kernel.org --- include/linux/bpf_verifier.h | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 83dff25545ee..4c926227f612 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -491,25 +491,42 @@ struct bpf_insn_aux_data { #define BPF_VERIFIER_TMP_LOG_SIZE 1024 struct bpf_verifier_log { - u32 level; - char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; + /* Logical start and end positions of a "log window" of the verifier log. + * start_pos == 0 means we haven't truncated anything. + * Once truncation starts to happen, start_pos + len_total == end_pos, + * except during log reset situations, in which (end_pos - start_pos) + * might get smaller than len_total (see bpf_vlog_reset()). + * Generally, (end_pos - start_pos) gives number of useful data in + * user log buffer.
+ */ + u64 start_pos; + u64 end_pos; char __user *ubuf; - u32 len_used; + u32 level; u32 len_total; + char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 +#define BPF_LOG_FIXED 8 #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) -#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) +#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED) #define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U +static inline u32 bpf_log_used(const struct bpf_verifier_log *log) +{ + return log->end_pos - log->start_pos; +} + static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) { - return log->len_used >= log->len_total - 1; + if (log->level & BPF_LOG_FIXED) + return bpf_log_used(log) >= log->len_total - 1; + return false; } static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) @@ -596,7 +613,7 @@ struct bpf_verifier_env { u32 scratched_regs; /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; - u32 prev_log_len, prev_insn_print_len; + u64 prev_log_pos, prev_insn_print_pos; /* buffer used in reg_type_str() to generate reg_type string */ char type_str_buf[TYPE_STR_BUF_LEN]; }; @@ -608,7 +625,9 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); -void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos); +void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); +void bpf_vlog_finalize(struct bpf_verifier_log *log); +bool bpf_vlog_truncated(const struct bpf_verifier_log *log); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3 From fa1c7d5cc404ac3b6e6b4ab6d00b07c76bd819be Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:57 -0700 Subject: bpf: Keep track of total log content size in both fixed and rolling modes Change how we do accounting in BPF_LOG_FIXED mode and adopt log->end_pos as the *logical* log position. This means that we can now go beyond the physical log buffer size and be able to tell what the log buffer size should be to fit the entire log contents without -ENOSPC. To do this for BPF_LOG_FIXED mode, we need to remove the short-circuiting logic of not vsnprintf()'ing further log content once we have filled up the user-provided buffer, which is done by the bpf_verifier_log_needed() checks. We modify these checks to always keep going if log->level is non-zero (i.e., a log is requested), even if log->ubuf was NULL'ed out due to copying data to user-space, or if the entire log buffer is physically full. We adapt the bpf_verifier_vlog() routine to work correctly with the log->ubuf == NULL condition, performing log formatting into a temporary kernel buffer and doing all the necessary accounting, but just avoiding copying data out if the buffer is full or NULL'ed out. With these changes, it's now possible to do this sort of determination of log content size in both BPF_LOG_FIXED and the default rolling log mode. We need to keep in mind bpf_vlog_reset(), though, which shrinks the log contents after successful verification of a particular code path. This log reset means that log->end_pos isn't always increasing, so to return back to users what the log buffer size should be to fit all log content without causing -ENOSPC even in the presence of log resetting, we need to keep the maximum over the "lifetime" of logging.
We do this accounting in the bpf_vlog_update_len_max() helper. A related and subtle aspect is that, with this logical log->end_pos, even in BPF_LOG_FIXED mode we could temporarily "overflow" the buffer, but then reset it back with bpf_vlog_reset() to a position inside the user-supplied log_buf. In such a situation we still want to properly maintain the terminating zero. We will eventually return -ENOSPC even if the final log buffer is too small (we detect this through the log->len_max check). This behavior is simpler to reason about and is consistent with the current behavior of the verifier log. Handling this required a small addition to the bpf_vlog_reset() logic to avoid doing put_user() beyond the physical log buffer dimensions. Another issue to keep in mind is that we limit the log buffer size to a 32-bit value and keep the log length as a u32, but theoretically the verifier could produce a huge log stretching beyond 4GB. Instead of keeping (and later returning) a 64-bit log length, we cap it at UINT_MAX. The current UAPI makes it impossible to specify a log buffer size bigger than 4GB anyway, so we don't really lose anything here and keep everything consistently 32-bit in the UAPI. This property will be utilized in the next patch. Doing the same determination of the maximum log buffer for rolling mode is trivial, as log->end_pos and log->start_pos are already logical positions, so there is nothing new there. These changes incidentally fix one small issue with the previous logging logic. Previously, if the user provided a log buffer of size N and the actual log output was exactly N-1 bytes + terminating \0, kernel logic couldn't distinguish this condition from the log truncation scenario, which would end up with truncated log contents of N-1 bytes + terminating \0 as well. But now, with log->end_pos being a logical position that can go beyond the actual log buffer size, we can distinguish these two conditions, which we do in this patch. This plays nicely with returning log_size_actual (implemented in the UAPI in the next patch), as we can now guarantee that if the user takes such log_size_actual and provides a log buffer of that exact size, they will not get -ENOSPC in return. All in all, these changes conceptually unify the fixed and rolling log modes much better, and enable a nice feature requested by users: knowing what the size of the buffer should be to avoid -ENOSPC. We'll plumb this through the UAPI and the code in the next patch.
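A condensed sketch of the accounting described above (illustrative only; the in-kernel helpers in kernel/bpf/log.c differ in details such as the put_user() handling):

    static void bpf_vlog_account_sketch(struct bpf_verifier_log *log, u32 n)
    {
        u64 len;

        log->end_pos += n;
        /* rotating mode: drag start_pos along once the logical window
         * outgrows the physical buffer, so that
         * end_pos - start_pos == len_total */
        if (!(log->level & BPF_LOG_FIXED) &&
            log->end_pos - log->start_pos > log->len_total)
            log->start_pos = log->end_pos - log->len_total;

        /* len_max keeps the high-water mark across bpf_vlog_reset(),
         * capped at UINT_MAX to stay 32-bit in the UAPI */
        len = log->end_pos < UINT_MAX ? log->end_pos : UINT_MAX;
        if (log->len_max < (u32)len)
            log->len_max = (u32)len;

        /* the physical write offset in the user buffer simply wraps:
         * pos % log->len_total */
    }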
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-12-andrii@kernel.org --- include/linux/bpf_verifier.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4c926227f612..98d2eb382dbb 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -504,6 +504,7 @@ struct bpf_verifier_log { char __user *ubuf; u32 level; u32 len_total; + u32 len_max; char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; @@ -517,23 +518,16 @@ struct bpf_verifier_log { #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U -static inline u32 bpf_log_used(const struct bpf_verifier_log *log) -{ - return log->end_pos - log->start_pos; -} - static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) { if (log->level & BPF_LOG_FIXED) - return bpf_log_used(log) >= log->len_total - 1; + return log->end_pos >= log->len_total; return false; } static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { - return log && - ((log->level && log->ubuf && !bpf_verifier_log_full(log)) || - log->level == BPF_LOG_KERNEL); + return log && log->level; } #define BPF_MAX_SUBPROGS 256 -- cgit v1.2.3 From 47a71c1f9af0a334c9dfa97633c41de4feda4287 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:58 -0700 Subject: bpf: Add log_true_size output field to return necessary log buffer size Add output-only log_true_size and btf_log_true_size fields to the BPF_PROG_LOAD and BPF_BTF_LOAD commands, respectively. They return the size of the log buffer necessary to fit all the log contents at the specified log_level. This is very useful for BPF loader libraries like libbpf to be able to size the log buffer correctly, but it could be used by users directly as well, if necessary. This patch plumbs all this through the code, taking into account the actual bpf_attr size provided by user space to determine whether these new fields are expected by users. If they are, they are set by the kernel on return. We refactor the btf_parse() function to accommodate this, moving attr and uattr handling inside it. The rest is very straightforward code, which is split from the logging accounting changes in the previous patch to make it simpler to review the logic vs the UAPI changes.
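A hypothetical user-space flow using the new field (a fragment sketch: program setup and full error handling elided, and libbpf normally does this for you):

    char small_buf[4096];
    union bpf_attr attr = {};

    /* ... fill in prog_type, insns, insn_cnt, license ... */
    attr.log_level = 2;
    attr.log_buf = (unsigned long)small_buf;
    attr.log_size = sizeof(small_buf);

    int fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    if (fd < 0 && errno == ENOSPC) {
        /* the kernel reported the exact size needed at this log_level */
        char *big_buf = malloc(attr.log_true_size);

        attr.log_buf = (unsigned long)big_buf;
        attr.log_size = attr.log_true_size;
        fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    }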
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-13-andrii@kernel.org --- include/linux/bpf.h | 2 +- include/linux/btf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 002a811b6b90..2c6095bd7d69 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2175,7 +2175,7 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); diff --git a/include/linux/btf.h b/include/linux/btf.h index d53b10cc55f2..495250162422 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -125,7 +125,7 @@ extern const struct file_operations btf_fops; void btf_get(struct btf *btf); void btf_put(struct btf *btf); -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr); +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz); struct btf *btf_get_by_fd(int fd); int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, -- cgit v1.2.3 From bdcab4144f5da97cc0fa7e1dd63b8475e10c8f0a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:59 -0700 Subject: bpf: Simplify internal verifier log interface Simplify the internal verifier log API down to bpf_vlog_init() and bpf_vlog_finalize(). The former handles input argument validation in one place and makes it easier to change. The latter subsumes the -ENOSPC (truncation) and -EFAULT handling and simplifies both callers' code (bpf_check() and btf_parse()). For btf_parse(), this patch also makes sure that verifier log finalization happens even if there is some error condition during the BTF verification process prior to the normal finalization step.
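The resulting caller pattern can be sketched as follows (a simplified sketch with error handling compressed; the real bpf_check()/btf_parse() code differs in detail):

    u32 log_true_size;
    int err;

    err = bpf_vlog_init(&env->log, attr->log_level,
                        u64_to_user_ptr(attr->log_buf), attr->log_size);
    if (err)
        return err;

    /* ... run verification, appending to the log via bpf_log() ... */

    err = bpf_vlog_finalize(&env->log, &log_true_size);
    /* finalize rotates/zero-terminates the user buffer and returns
     * -ENOSPC on truncation or -EFAULT on copy failures;
     * log_true_size reports the buffer size needed to fit it all */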
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-14-andrii@kernel.org --- include/linux/bpf_verifier.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 98d2eb382dbb..f03852b89d28 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -518,13 +518,6 @@ struct bpf_verifier_log { #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U -static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) -{ - if (log->level & BPF_LOG_FIXED) - return log->end_pos >= log->len_total; - return false; -} - static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && log->level; @@ -612,16 +605,16 @@ struct bpf_verifier_env { char type_str_buf[TYPE_STR_BUF_LEN]; }; -bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log); __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); +int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, + char __user *log_buf, u32 log_size); void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); -void bpf_vlog_finalize(struct bpf_verifier_log *log); -bool bpf_vlog_truncated(const struct bpf_verifier_log *log); +int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3