From 215bf4962f6c9605710012fad222a5fec001b3ad Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:15 -0800
Subject: bpf: add iterator kfuncs registration and validation logic

Add ability to register kfuncs that implement BPF open-coded iterator
contract and enforce naming and function proto convention. Enforcement
happens at the time of kfunc registration and significantly simplifies
the rest of iterators logic in the verifier.

More details follow in subsequent patches, but we enforce the following
conditions.

All kfuncs (constructor, next, destructor) have to be named consistenly
as bpf_iter_<type>_{new,next,destroy}(), respectively. <type> represents
iterator type, and iterator state should be represented as a matching
`struct bpf_iter_<type>` state type. Also, all iter kfuncs should have
a pointer to this `struct bpf_iter_<type>` as the very first argument.

Additionally:
  - Constructor, i.e., bpf_iter_<type>_new(), can have arbitrary extra
  number of arguments. Return type is not enforced either.
  - Next method, i.e., bpf_iter_<type>_next(), has to return a pointer
  type and should have exactly one argument: `struct bpf_iter_<type> *`
  (const/volatile/restrict and typedefs are ignored).
  - Destructor, i.e., bpf_iter_<type>_destroy(), should return void and
  should have exactly one argument, similar to the next method.
  - struct bpf_iter_<type> size is enforced to be positive and
  a multiple of 8 bytes (to fit stack slots correctly).

Such strictness and consistency allows to build generic helpers
abstracting important, but boilerplate, details to be able to use
open-coded iterators effectively and ergonomically (see bpf_for_each()
in subsequent patches). It also simplifies the verifier logic in some
places. At the same time, this doesn't hurt generality of possible
iterator implementations. Win-win.

Constructor kfunc is marked with a new KF_ITER_NEW flags, next method is
marked with KF_ITER_NEXT (and should also have KF_RET_NULL, of course),
while destructor kfunc is marked as KF_ITER_DESTROY.

Additionally, we add a trivial kfunc name validation: it should be
a valid non-NULL and non-empty string.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 2 ++
 include/linux/btf.h          | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include')
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 18538bad2b8c..e2dc7f064449 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -59,6 +59,8 @@ struct bpf_active_lock {
 	u32 id;
 };
 
+#define ITER_PREFIX "bpf_iter_"
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 556b3e2e7471..1bba0827e8c4 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -71,6 +71,10 @@
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
 #define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
+/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */
+#define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
+#define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
+#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
-- 
cgit v1.2.3


From 06accc8779c1d558a5b5a21f2ac82b0c95827ddd Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:16 -0800
Subject: bpf: add support for open-coded iterator loops

Teach verifier about the concept of the open-coded (or inline) iterators.

This patch adds generic iterator loop verification logic, new STACK_ITER
stack slot type to contain iterator state, and necessary kfunc plumbing
for iterator's constructor, destructor and next methods. Next patch
implements first specific iterator (numbers iterator for implementing
for() loop logic). Such split allows to have more focused commits for
verifier logic and separate commit that we could point later to
demonstrating  what does it take to add a new kind of iterator.

Each kind of iterator has its own associated struct bpf_iter_<type>,
where <type> denotes a specific type of iterator. struct bpf_iter_<type>
state is supposed to live on BPF program stack, so there will be no way
to change its size later on without breaking backwards compatibility, so
choose wisely! But given this struct is specific to a given <type> of
iterator, this allows a lot of flexibility: simple iterators could be
fine with just one stack slot (8 bytes), like numbers iterator in the
next patch, while some other more complicated iterators might need way
more to keep their iterator state. Either way, such design allows to
avoid runtime memory allocations, which otherwise would be necessary if
we fixed on-the-stack size and it turned out to be too small for a given
iterator implementation.

The way BPF verifier logic is implemented, there are no artificial
restrictions on a number of active iterators, it should work correctly
using multiple active iterators at the same time. This also means you
can have multiple nested iteration loops. struct bpf_iter_<type>
reference can be safely passed to subprograms as well.

General flow is easiest to demonstrate with a simple example using
number iterator implemented in next patch. Here's the simplest possible
loop:

  struct bpf_iter_num it;
  int *v;

  bpf_iter_num_new(&it, 2, 5);
  while ((v = bpf_iter_num_next(&it))) {
      bpf_printk("X = %d", *v);
  }
  bpf_iter_num_destroy(&it);

Above snippet should output "X = 2", "X = 3", "X = 4". Note that 5 is
exclusive and is not returned. This matches similar APIs (e.g., slices
in Go or Rust) that implement a range of elements, where end index is
non-inclusive.

In the above example, we see a trio of function:
  - constructor, bpf_iter_num_new(), which initializes iterator state
  (struct bpf_iter_num it) on the stack. If any of the input arguments
  are invalid, constructor should make sure to still initialize it such
  that subsequent bpf_iter_num_next() calls will return NULL. I.e., on
  error, return error and construct empty iterator.
  - next method, bpf_iter_num_next(), which accepts pointer to iterator
  state and produces an element. Next method should always return
  a pointer. The contract between BPF verifier is that next method will
  always eventually return NULL when elements are exhausted. Once NULL is
  returned, subsequent next calls should keep returning NULL. In the
  case of numbers iterator, bpf_iter_num_next() returns a pointer to an int
  (storage for this integer is inside the iterator state itself),
  which can be dereferenced after corresponding NULL check.
  - once done with the iterator, it's mandated that user cleans up its
  state with the call to destructor, bpf_iter_num_destroy() in this
  case. Destructor frees up any resources and marks stack space used by
  struct bpf_iter_num as usable for something else.

Any other iterator implementation will have to implement at least these
three methods. It is enforced that for any given type of iterator only
applicable constructor/destructor/next are callable. I.e., verifier
ensures you can't pass number iterator state into, say, cgroup
iterator's next method.

It is important to keep the naming pattern consistent to be able to
create generic macros to help with BPF iter usability. E.g., one
of the follow up patches adds generic bpf_for_each() macro to bpf_misc.h
in selftests, which allows to utilize iterator "trio" nicely without
having to code the above somewhat tedious loop explicitly every time.
This is enforced at kfunc registration point by one of the previous
patches in this series.

At the implementation level, iterator state tracking for verification
purposes is very similar to dynptr. We add STACK_ITER stack slot type,
reserve necessary number of slots, depending on
sizeof(struct bpf_iter_<type>), and keep track of necessary extra state
in the "main" slot, which is marked with non-zero ref_obj_id. Other
slots are also marked as STACK_ITER, but have zero ref_obj_id. This is
simpler than having a separate "is_first_slot" flag.

Another big distinction is that STACK_ITER is *always refcounted*, which
simplifies implementation without sacrificing usability. So no need for
extra "iter_id", no need to anticipate reuse of STACK_ITER slots for new
constructors, etc. Keeping it simple here.

As far as the verification logic goes, there are two extensive comments:
in process_iter_next_call() and iter_active_depths_differ() explaining
some important and sometimes subtle aspects. Please refer to them for
details.

But from 10,000-foot point of view, next methods are the points of
forking a verification state, which are conceptually similar to what
verifier is doing when validating conditional jump. We branch out at
a `call bpf_iter_<type>_next` instruction and simulate two outcomes:
NULL (iteration is done) and non-NULL (new element is returned). NULL is
simulated first and is supposed to reach exit without looping. After
that non-NULL case is validated and it either reaches exit (for trivial
examples with no real loop), or reaches another `call bpf_iter_<type>_next`
instruction with the state equivalent to already (partially) validated
one. State equivalency at that point means we technically are going to
be looping forever without "breaking out" out of established "state
envelope" (i.e., subsequent iterations don't add any new knowledge or
constraints to the verifier state, so running 1, 2, 10, or a million of
them doesn't matter). But taking into account the contract stating that
iterator next method *has to* return NULL eventually, we can conclude
that loop body is safe and will eventually terminate. Given we validated
logic outside of the loop (NULL case), and concluded that loop body is
safe (though potentially looping many times), verifier can claim safety
of the overall program logic.

The rest of the patch is necessary plumbing for state tracking, marking,
validation, and necessary further kfunc plumbing to allow implementing
iterator constructor, destructor, and next methods.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e2dc7f064449..0c052bc79940 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -61,6 +61,12 @@ struct bpf_active_lock {
 
 #define ITER_PREFIX "bpf_iter_"
 
+enum bpf_iter_state {
+	BPF_ITER_STATE_INVALID, /* for non-first slot */
+	BPF_ITER_STATE_ACTIVE,
+	BPF_ITER_STATE_DRAINED,
+};
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
@@ -105,6 +111,18 @@ struct bpf_reg_state {
 			bool first_slot;
 		} dynptr;
 
+		/* For bpf_iter stack slots */
+		struct {
+			/* BTF container and BTF type ID describing
+			 * struct bpf_iter_<type> of an iterator state
+			 */
+			struct btf *btf;
+			u32 btf_id;
+			/* packing following two fields to fit iter state into 16 bytes */
+			enum bpf_iter_state state:2;
+			int depth:30;
+		} iter;
+
 		/* Max size from any of the above. */
 		struct {
 			unsigned long raw1;
@@ -143,6 +161,8 @@ struct bpf_reg_state {
 	 * same reference to the socket, to determine proper reference freeing.
 	 * For stack slots that are dynptrs, this is used to track references to
 	 * the dynptr to determine proper reference freeing.
+	 * Similarly to dynptrs, we use ID to track "belonging" of a reference
+	 * to a specific instance of bpf_iter.
 	 */
 	u32 id;
 	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
@@ -213,9 +233,11 @@ enum bpf_stack_slot_type {
 	 * is stored in bpf_stack_state->spilled_ptr.dynptr.type
 	 */
 	STACK_DYNPTR,
+	STACK_ITER,
 };
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
+
 #define BPF_DYNPTR_SIZE		sizeof(struct bpf_dynptr_kern)
 #define BPF_DYNPTR_NR_SLOTS		(BPF_DYNPTR_SIZE / BPF_REG_SIZE)
 
@@ -450,6 +472,7 @@ struct bpf_insn_aux_data {
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
+	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
-- 
cgit v1.2.3


From 6018e1f407cccf39b804d1f75ad4de7be4e6cc45 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:17 -0800
Subject: bpf: implement numbers iterator

Implement the first open-coded iterator type over a range of integers.

It's public API consists of:
  - bpf_iter_num_new() constructor, which accepts [start, end) range
    (that is, start is inclusive, end is exclusive).
  - bpf_iter_num_next() which will keep returning read-only pointer to int
    until the range is exhausted, at which point NULL will be returned.
    If bpf_iter_num_next() is kept calling after this, NULL will be
    persistently returned.
  - bpf_iter_num_destroy() destructor, which needs to be called at some
    point to clean up iterator state. BPF verifier enforces that iterator
    destructor is called at some point before BPF program exits.

Note that `start = end = X` is a valid combination to setup an empty
iterator. bpf_iter_num_new() will return 0 (success) for any such
combination.

If bpf_iter_num_new() detects invalid combination of input arguments, it
returns error, resets iterator state to, effectively, empty iterator, so
any subsequent call to bpf_iter_num_next() will keep returning NULL.

BPF verifier has no knowledge that returned integers are in the
[start, end) value range, as both `start` and `end` are not statically
known and enforced: they are runtime values.

While the implementation is pretty trivial, some care needs to be taken
to avoid overflows and underflows. Subsequent selftests will validate
correctness of [start, end) semantics, especially around extremes
(INT_MIN and INT_MAX).

Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can
be specified.

bpf_iter_num_{new,next,destroy}() is a logical evolution from bounded
BPF loops and bpf_loop() helper and is the basis for implementing
ergonomic BPF loops with no statically known or verified bounds.
Subsequent patches implement bpf_for() macro, demonstrating how this can
be wrapped into something that works and feels like a normal for() loop
in C language.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      | 8 ++++++--
 include/uapi/linux/bpf.h | 8 ++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6792a7940e1e..e64ff1e89fb2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1617,8 +1617,12 @@ struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33
 
-/* Maximum number of loops for bpf_loop */
-#define BPF_MAX_LOOPS	BIT(23)
+/* Maximum number of loops for bpf_loop and bpf_iter_num.
+ * It's enum to expose it (and thus make it discoverable) through BTF.
+ */
+enum {
+	BPF_MAX_LOOPS = 8 * 1024 * 1024,
+};
 
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 976b194eb775..4abddb668a10 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7112,4 +7112,12 @@ enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
 };
 
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+	/* opaque iterator state; having __u64 here allows to preserve correct
+	 * alignment requirements in vmlinux.h, generated from BTF
+	 */
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3