From 215bf4962f6c9605710012fad222a5fec001b3ad Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 8 Mar 2023 10:41:15 -0800
Subject: bpf: add iterator kfuncs registration and validation logic

Add the ability to register kfuncs that implement the BPF open-coded
iterator contract and enforce the naming and function proto convention.
Enforcement happens at the time of kfunc registration and significantly
simplifies the rest of the iterators logic in the verifier.

More details follow in subsequent patches, but we enforce the following
conditions.

All kfuncs (constructor, next, destructor) have to be named consistently
as bpf_iter_<type>_{new,next,destroy}(), respectively. <type> represents
iterator type, and iterator state should be represented as a matching
`struct bpf_iter_<type>` state type. Also, all iter kfuncs should have
a pointer to this `struct bpf_iter_<type>` as the very first argument.

Additionally:
  - Constructor, i.e., bpf_iter_<type>_new(), can have an arbitrary
    number of extra arguments. Return type is not enforced either.
  - Next method, i.e., bpf_iter_<type>_next(), has to return a pointer
    type and should have exactly one argument:
    `struct bpf_iter_<type> *` (const/volatile/restrict and typedefs
    are ignored).
  - Destructor, i.e., bpf_iter_<type>_destroy(), should return void and
    should have exactly one argument, similar to the next method.
  - struct bpf_iter_<type> size is enforced to be positive and
    a multiple of 8 bytes (to fit stack slots correctly).

Such strictness and consistency allow us to build generic helpers
abstracting important, but boilerplate, details to be able to use
open-coded iterators effectively and ergonomically (see bpf_for_each()
in subsequent patches). It also simplifies the verifier logic in some
places. At the same time, this doesn't hurt the generality of possible
iterator implementations. Win-win.

Constructor kfunc is marked with a new KF_ITER_NEW flag, next method is
marked with KF_ITER_NEXT (and should also have KF_RET_NULL, of course),
while destructor kfunc is marked as KF_ITER_DESTROY.

Additionally, we add a trivial kfunc name validation: it should be
a valid non-NULL and non-empty string.

Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20230308184121.1165081-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h | 2 ++
 include/linux/btf.h          | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 18538bad2b8c..e2dc7f064449 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -59,6 +59,8 @@ struct bpf_active_lock {
 	u32 id;
 };

+#define ITER_PREFIX "bpf_iter_"
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 556b3e2e7471..1bba0827e8c4 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -71,6 +71,10 @@
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
 #define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
+/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */
+#define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
+#define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
+#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */

 /*
  * Tag marking a kernel function as a kfunc.
  * This is meant to minimize the
-- cgit v1.2.3


From 06accc8779c1d558a5b5a21f2ac82b0c95827ddd Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 8 Mar 2023 10:41:16 -0800
Subject: bpf: add support for open-coded iterator loops

Teach the verifier about the concept of open-coded (or inline)
iterators.

This patch adds generic iterator loop verification logic, a new
STACK_ITER stack slot type to contain iterator state, and the necessary
kfunc plumbing for an iterator's constructor, destructor, and next
methods. The next patch implements the first specific iterator (numbers
iterator for implementing for() loop logic). Such a split allows us to
have more focused commits for verifier logic and a separate commit that
we can later point to as a demonstration of what it takes to add a new
kind of iterator.

Each kind of iterator has its own associated struct bpf_iter_<type>,
where <type> denotes a specific type of iterator. struct
bpf_iter_<type> state is supposed to live on the BPF program stack, so
there will be no way to change its size later on without breaking
backwards compatibility, so choose wisely! But given this struct is
specific to a given <type> of iterator, this allows a lot of
flexibility: simple iterators could be fine with just one stack slot
(8 bytes), like the numbers iterator in the next patch, while some
other more complicated iterators might need way more to keep their
iterator state. Either way, such a design allows us to avoid runtime
memory allocations, which would otherwise be necessary if we fixed the
on-the-stack size and it turned out to be too small for a given
iterator implementation.

The way the BPF verifier logic is implemented, there are no artificial
restrictions on the number of active iterators; it should work
correctly using multiple active iterators at the same time. This also
means you can have multiple nested iteration loops. A struct
bpf_iter_<type> reference can be safely passed to subprograms as well.

General flow is easiest to demonstrate with a simple example using the
numbers iterator implemented in the next patch. Here's the simplest
possible loop:

  struct bpf_iter_num it;
  int *v;

  bpf_iter_num_new(&it, 2, 5);
  while ((v = bpf_iter_num_next(&it))) {
      bpf_printk("X = %d", *v);
  }
  bpf_iter_num_destroy(&it);

The above snippet should output "X = 2", "X = 3", "X = 4". Note that 5
is exclusive and is not returned. This matches similar APIs (e.g.,
slices in Go or Rust) that implement a range of elements, where the end
index is non-inclusive.

In the above example, we see a trio of functions:
  - constructor, bpf_iter_num_new(), which initializes iterator state
    (struct bpf_iter_num it) on the stack. If any of the input arguments
    are invalid, the constructor should make sure to still initialize it
    such that subsequent bpf_iter_num_next() calls will return NULL.
    I.e., on error, return an error and construct an empty iterator.
  - next method, bpf_iter_num_next(), which accepts a pointer to
    iterator state and produces an element. The next method should
    always return a pointer. The contract with the BPF verifier is that
    the next method will always eventually return NULL when elements are
    exhausted. Once NULL is returned, subsequent next calls should keep
    returning NULL. In the case of the numbers iterator,
    bpf_iter_num_next() returns a pointer to an int (storage for this
    integer is inside the iterator state itself), which can be
    dereferenced after the corresponding NULL check.
  - once done with the iterator, it's mandated that the user cleans up
    its state with a call to the destructor, bpf_iter_num_destroy() in
    this case.
    The destructor frees up any resources and marks the stack space used
    by struct bpf_iter_num as usable for something else.

Any other iterator implementation will have to implement at least these
three methods. It is enforced that for any given type of iterator only
the applicable constructor/destructor/next are callable. I.e., the
verifier ensures you can't pass number iterator state into, say,
a cgroup iterator's next method.

It is important to keep the naming pattern consistent to be able to
create generic macros to help with BPF iter usability. E.g., one of the
follow-up patches adds a generic bpf_for_each() macro to bpf_misc.h in
selftests, which makes it possible to utilize the iterator "trio"
nicely without having to code the above somewhat tedious loop
explicitly every time. This is enforced at the kfunc registration point
by one of the previous patches in this series.

At the implementation level, iterator state tracking for verification
purposes is very similar to dynptr. We add a STACK_ITER stack slot
type, reserve the necessary number of slots, depending on
sizeof(struct bpf_iter_<type>), and keep track of the necessary extra
state in the "main" slot, which is marked with a non-zero ref_obj_id.
Other slots are also marked as STACK_ITER, but have zero ref_obj_id.
This is simpler than having a separate "is_first_slot" flag.

Another big distinction is that STACK_ITER is *always refcounted*,
which simplifies the implementation without sacrificing usability. So
there is no need for an extra "iter_id", no need to anticipate reuse of
STACK_ITER slots for new constructors, etc. Keeping it simple here.

As far as the verification logic goes, there are two extensive
comments, in process_iter_next_call() and iter_active_depths_differ(),
explaining some important and sometimes subtle aspects. Please refer to
them for details.

But from a 10,000-foot point of view, next methods are the points of
forking a verification state, which is conceptually similar to what the
verifier does when validating a conditional jump. We branch out at
a `call bpf_iter_<type>_next` instruction and simulate two outcomes:
NULL (iteration is done) and non-NULL (a new element is returned). NULL
is simulated first and is supposed to reach exit without looping. After
that, the non-NULL case is validated and it either reaches exit (for
trivial examples with no real loop), or reaches another
`call bpf_iter_<type>_next` instruction with a state equivalent to an
already (partially) validated one. State equivalency at that point
means we technically are going to be looping forever without "breaking
out" of the established "state envelope" (i.e., subsequent iterations
don't add any new knowledge or constraints to the verifier state, so
running 1, 2, 10, or a million of them doesn't matter). But taking into
account the contract stating that the iterator next method *has to*
return NULL eventually, we can conclude that the loop body is safe and
will eventually terminate. Given we validated the logic outside of the
loop (NULL case), and concluded that the loop body is safe (though
potentially looping many times), the verifier can claim safety of the
overall program logic.

The rest of the patch is the necessary plumbing for state tracking,
marking, and validation, plus further kfunc plumbing to allow
implementing iterator constructor, destructor, and next methods.
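To make the enforced contract concrete, a hypothetical iterator over a
fixed array might be declared and registered as in the following sketch
(the bpf_iter_arr type and its kfuncs are invented purely for
illustration and are not part of this series):

  /* state struct: positive size, multiple of 8 bytes, opaque to BPF progs */
  struct bpf_iter_arr {
          __u64 __opaque[2];
  } __attribute__((aligned(8)));

  /* trio following the bpf_iter_<type>_{new,next,destroy}() convention */
  __bpf_kfunc int bpf_iter_arr_new(struct bpf_iter_arr *it, int *arr, __u32 nr);
  __bpf_kfunc int *bpf_iter_arr_next(struct bpf_iter_arr *it);
  __bpf_kfunc void bpf_iter_arr_destroy(struct bpf_iter_arr *it);

  /* registration with the matching KF_ITER_* flags */
  BTF_ID_FLAGS(func, bpf_iter_arr_new, KF_ITER_NEW)
  BTF_ID_FLAGS(func, bpf_iter_arr_next, KF_ITER_NEXT | KF_RET_NULL)
  BTF_ID_FLAGS(func, bpf_iter_arr_destroy, KF_ITER_DESTROY)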
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20230308184121.1165081-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e2dc7f064449..0c052bc79940 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -61,6 +61,12 @@ struct bpf_active_lock {

 #define ITER_PREFIX "bpf_iter_"

+enum bpf_iter_state {
+	BPF_ITER_STATE_INVALID, /* for non-first slot */
+	BPF_ITER_STATE_ACTIVE,
+	BPF_ITER_STATE_DRAINED,
+};
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
@@ -105,6 +111,18 @@ struct bpf_reg_state {
 			bool first_slot;
 		} dynptr;

+		/* For bpf_iter stack slots */
+		struct {
+			/* BTF container and BTF type ID describing
+			 * struct bpf_iter_<type> of an iterator state
+			 */
+			struct btf *btf;
+			u32 btf_id;
+			/* packing following two fields to fit iter state into 16 bytes */
+			enum bpf_iter_state state:2;
+			int depth:30;
+		} iter;
+
 		/* Max size from any of the above. */
 		struct {
 			unsigned long raw1;
@@ -143,6 +161,8 @@ struct bpf_reg_state {
 	 * same reference to the socket, to determine proper reference freeing.
 	 * For stack slots that are dynptrs, this is used to track references to
 	 * the dynptr to determine proper reference freeing.
+	 * Similarly to dynptrs, we use ID to track "belonging" of a reference
+	 * to a specific instance of bpf_iter.
 	 */
 	u32 id;
 	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
@@ -213,9 +233,11 @@ enum bpf_stack_slot_type {
 	 * is stored in bpf_stack_state->spilled_ptr.dynptr.type
 	 */
 	STACK_DYNPTR,
+	STACK_ITER,
 };

 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
+
 #define BPF_DYNPTR_SIZE		sizeof(struct bpf_dynptr_kern)
 #define BPF_DYNPTR_NR_SLOTS		(BPF_DYNPTR_SIZE / BPF_REG_SIZE)

@@ -450,6 +472,7 @@ struct bpf_insn_aux_data {
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
+	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
 	u8 alu_state; /* used in combination with alu_limit */

 	/* below fields are initialized once */
-- cgit v1.2.3


From 6018e1f407cccf39b804d1f75ad4de7be4e6cc45 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 8 Mar 2023 10:41:17 -0800
Subject: bpf: implement numbers iterator

Implement the first open-coded iterator type over a range of integers.

Its public API consists of:
  - bpf_iter_num_new() constructor, which accepts a [start, end) range
    (that is, start is inclusive, end is exclusive).
  - bpf_iter_num_next(), which will keep returning a read-only pointer
    to int until the range is exhausted, at which point NULL will be
    returned. If bpf_iter_num_next() keeps being called after this,
    NULL will be persistently returned.
  - bpf_iter_num_destroy() destructor, which needs to be called at some
    point to clean up iterator state. The BPF verifier enforces that
    the iterator destructor is called at some point before the BPF
    program exits.

Note that `start = end = X` is a valid combination to set up an empty
iterator. bpf_iter_num_new() will return 0 (success) for any such
combination.

If bpf_iter_num_new() detects an invalid combination of input
arguments, it returns an error and resets the iterator state to,
effectively, an empty iterator, so any subsequent call to
bpf_iter_num_next() will keep returning NULL.
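A sketch of these semantics from the BPF program side (the -EINVAL
value for the invalid-arguments case is an assumption):

  struct bpf_iter_num it;
  int *v;

  /* start == end is valid: constructor succeeds, iterator is empty */
  bpf_iter_num_new(&it, 3, 3);
  v = bpf_iter_num_next(&it);     /* NULL right away */
  bpf_iter_num_destroy(&it);

  /* invalid range: constructor returns an error (e.g., -EINVAL) but
   * still constructs an empty iterator, so next safely returns NULL
   */
  if (bpf_iter_num_new(&it, 5, 2) < 0) { /* handle error if desired */ }
  while ((v = bpf_iter_num_next(&it))) { /* never entered */ }
  bpf_iter_num_destroy(&it);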
The BPF verifier has no knowledge that the returned integers are in the
[start, end) value range, as both `start` and `end` are not statically
known or enforced: they are runtime values.

While the implementation is pretty trivial, some care needs to be taken
to avoid overflows and underflows. Subsequent selftests will validate
correctness of [start, end) semantics, especially around extremes
(INT_MIN and INT_MAX).

Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can
be specified.

The bpf_iter_num_{new,next,destroy}() trio is a logical evolution from
bounded BPF loops and the bpf_loop() helper, and is the basis for
implementing ergonomic BPF loops with no statically known or verified
bounds. Subsequent patches implement the bpf_for() macro, demonstrating
how this can be wrapped into something that works and feels like
a normal for() loop in the C language.

Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20230308184121.1165081-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6792a7940e1e..e64ff1e89fb2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1617,8 +1617,12 @@ struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33

-/* Maximum number of loops for bpf_loop */
-#define BPF_MAX_LOOPS	BIT(23)
+/* Maximum number of loops for bpf_loop and bpf_iter_num.
+ * It's enum to expose it (and thus make it discoverable) through BTF.
+ */
+enum {
+	BPF_MAX_LOOPS = 8 * 1024 * 1024,
+};

 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
-- cgit v1.2.3


From 4b5ce570dbef57a20acdd71b0c65376009012354 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 9 Mar 2023 22:01:49 -0800
Subject: bpf: ensure state checkpointing at iter_next() call sites

The state equivalence check and checkpointing performed in
is_state_visited() employ certain heuristics to try to save memory by
avoiding state checkpoints if not enough jumps and instructions
happened since the last checkpoint. This leads to unpredictability of
whether a particular instruction will be checkpointed and how
regularly. While normally this does not cause many problems (except
inconveniences for predictable verifier tests, which we overcome with
the BPF_F_TEST_STATE_FREQ flag), it turns out that's not the case for
open-coded iterators.

Checking and saving state checkpoints at iter_next() call sites is
crucial for fast convergence of open-coded iterator loop logic, so we
need to force it. If we don't do that, is_state_visited() might skip
saving a checkpoint, causing an unnecessarily long sequence of
non-checkpointed instructions and jumps, leading to exhaustion of the
jump history buffer, and potentially other undesired outcomes. It is
expected that with correct open-coded iterators convergence will happen
quickly, so we don't run a risk of exhausting memory.

This patch adds, in addition to prune and jump instruction marks, also
a "forced checkpoint" mark, and makes sure that any iter_next() call
instruction is marked as such.
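A minimal sketch of how such a mark might be applied during the
verifier's instruction graph walk (the helper names follow the
description above but are assumptions about the implementation):

  /* when the insn walk reaches an iter_next() kfunc call, mark it as
   * both a prune point and a forced checkpoint, bypassing the usual
   * "enough insns/jumps since last checkpoint" heuristics
   */
  if (is_iter_next_insn(env, t)) {
          mark_prune_point(env, t);
          mark_force_checkpoint(env, t);
  }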
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20230310060149.625887-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0c052bc79940..81d525d057c7 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -477,8 +477,12 @@ struct bpf_insn_aux_data {

 	/* below fields are initialized once */
 	unsigned int orig_idx; /* original instruction index */
-	bool prune_point;
 	bool jmp_point;
+	bool prune_point;
+	/* ensure we check state equivalence and save state checkpoint at
+	 * this instruction, regardless of any heuristics
+	 */
+	bool force_checkpoint;
 };

 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
-- cgit v1.2.3


From 4cbd23cc92c49173e402753cab62b8a7754ed18f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Tue, 7 Mar 2023 22:59:20 -0800
Subject: bpf: Move a few bpf_local_storage functions to static scope

This patch moves bpf_local_storage_free_rcu() and
bpf_selem_unlink_map() to static scope because they are not used
outside of bpf_local_storage.c.

Signed-off-by: Martin KaFai Lau
Link: https://lore.kernel.org/r/20230308065936.1550103-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_local_storage.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index d934248b8e81..502ad7093f13 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -147,8 +147,6 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu);
 void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 			struct bpf_local_storage_elem *selem);

-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
-
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
 		bool charge_mem, gfp_t gfp_flags);
@@ -163,7 +161,6 @@ struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags, gfp_t gfp_flags);

-void bpf_local_storage_free_rcu(struct rcu_head *rcu);
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map);

 #endif /* _BPF_LOCAL_STORAGE_H */
-- cgit v1.2.3


From 2ffcb6fc50174d1efc8f98633eb2647d84483c68 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Tue, 7 Mar 2023 22:59:21 -0800
Subject: bpf: Refactor codes into bpf_local_storage_destroy

This patch first renames bpf_local_storage_unlink_nolock to
bpf_local_storage_destroy(). It better reflects that it is only used
when the storage's owner (sk/task/cgrp/inode) is being kfree()'d.

All of bpf_local_storage_destroy()'s callers take the spin lock and
then free the storage. This patch also moves these two steps into
bpf_local_storage_destroy().

This is preparation work for a later patch that uses
bpf_mem_cache_alloc/free in the bpf_local_storage.
Signed-off-by: Martin KaFai Lau
Link: https://lore.kernel.org/r/20230308065936.1550103-3-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_local_storage.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 502ad7093f13..5908a954ddc2 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -128,7 +128,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
 			 struct bpf_local_storage_map *smap,
 			 bool cacheit_lockit);

-bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage);
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage);

 void bpf_local_storage_map_free(struct bpf_map *map,
 				struct bpf_local_storage_cache *cache,
-- cgit v1.2.3


From fc6652aab6ad545de70b772550da9043d0b47f1c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Tue, 7 Mar 2023 22:59:24 -0800
Subject: bpf: Remember smap in bpf_local_storage

This patch remembers which smap triggered the allocation of a
'struct bpf_local_storage' object. The local_storage is allocated when
the very first selem is added to the owner. The smap pointer is needed
when using bpf_mem_cache_free in a later patch because it needs to free
to the correct smap's bpf_mem_alloc object.

When a selem is being removed, it needs to check if it is the selem
that triggered the creation of the local_storage. If it is, the
local_storage->smap pointer will be reset to NULL. This NULL reset is
done under the local_storage->lock in bpf_selem_unlink_storage_nolock()
when a selem is being removed. Also note that the local_storage may not
go away even if local_storage->smap is NULL, because there may be other
selems still stored in the local_storage.

Signed-off-by: Martin KaFai Lau
Link: https://lore.kernel.org/r/20230308065936.1550103-6-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_local_storage.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 5908a954ddc2..613b1805ed9f 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -83,6 +83,7 @@ struct bpf_local_storage_elem {

 struct bpf_local_storage {
 	struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
+	struct bpf_local_storage_map __rcu *smap;
 	struct hlist_head list; /* List of bpf_local_storage_elem */
 	void *owner;		/* The object that owns the above "list" of
 				 * bpf_local_storage_elem.
-- cgit v1.2.3


From a47eabf216f77cb6f22ceb38d46f1bb95968579c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Tue, 7 Mar 2023 22:59:25 -0800
Subject: bpf: Repurpose use_trace_rcu to reuse_now in bpf_local_storage

This patch re-purposes the use_trace_rcu argument to mean whether the
freed memory can be reused immediately or not. The use_trace_rcu is
renamed to reuse_now. Other than the boolean test being reversed, it
should be a no-op.

The following explains the reason for the rename and how it will be
used in a later patch.

In a later patch, bpf_mem_cache_alloc/free will be used in the
bpf_local_storage. The bpf mem allocator will reuse the freed memory
immediately. Some of the free paths in bpf_local_storage do not support
the memory being reused immediately. These paths are the "delete" elem
cases from the bpf_*_storage_delete() helper and the map_delete_elem()
syscall.
Note that "delete" elem before the owner's (sk/task/cgrp/inode) lifetime ended is not the common usage for the local storage. The common free path, bpf_local_storage_destroy(), can reuse the memory immediately. This common path means the storage stays with its owner until the owner is destroyed. The above mentioned "delete" elem paths that cannot reuse immediately always has the 'use_trace_rcu == true'. The cases that is safe for immediate reuse always have 'use_trace_rcu == false'. Instead of adding another arg in a later patch, this patch re-purpose this arg to reuse_now and have the test logic reversed. In a later patch, 'reuse_now == true' will free to the bpf_mem_cache_free() where the memory can be reused immediately. 'reuse_now == false' will go through the call_rcu_tasks_trace(). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 613b1805ed9f..18a31add2255 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -143,7 +143,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem); -- cgit v1.2.3 From c0d63f309186d8492577c67c67984c714b6b72bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:28 -0800 Subject: bpf: Add bpf_selem_free() This patch refactors the selem freeing logic into bpf_selem_free(). It is a preparation work for a later patch using bpf_mem_cache_alloc/free. The other kfree(selem) cases are also changed to bpf_selem_free(..., reuse_now = true). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-10-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 18a31add2255..a34f61467a2f 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -152,6 +152,10 @@ struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool charge_mem, gfp_t gfp_flags); +void bpf_selem_free(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map *smap, + bool reuse_now); + int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, -- cgit v1.2.3 From 74843b57ec70af7b67b7e6153374834ee18d139f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 9 Mar 2023 10:01:08 -0800 Subject: bpf: Change btf_record_find enum parameter to field_mask btf_record_find's 3rd parameter can be multiple enum btf_field_type's masked together. The function is called with BPF_KPTR in two places in verifier.c, so it works with masked values already. 
Signed-off-by: Dave Marchevsky
Link: https://lore.kernel.org/r/20230309180111.1618459-4-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e64ff1e89fb2..3a38db315f7f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1925,7 +1925,7 @@ void bpf_prog_free_id(struct bpf_prog *prog);
 void bpf_map_free_id(struct bpf_map *map);

 struct btf_field *btf_record_find(const struct btf_record *rec,
-				  u32 offset, enum btf_field_type type);
+				  u32 offset, u32 field_mask);
 void btf_record_free(struct btf_record *rec);
 void bpf_map_free_record(struct bpf_map *map);
 struct btf_record *btf_record_dup(const struct btf_record *rec);
-- cgit v1.2.3


From c8e18754091479fac3f5b6c053c6bc4be0b7fb11 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky
Date: Fri, 10 Mar 2023 15:07:41 -0800
Subject: bpf: Support __kptr to local kptrs

If a PTR_TO_BTF_ID type comes from program BTF - not vmlinux or module
BTF - it must have been allocated by bpf_obj_new and therefore must be
free'd with bpf_obj_drop. Such a PTR_TO_BTF_ID is considered a "local
kptr" and is tagged with MEM_ALLOC type tag by bpf_obj_new.

This patch adds support for treating __kptr-tagged pointers to "local
kptrs" as having an implicit bpf_obj_drop destructor for referenced
kptr acquire / release semantics. Consider the following example:

  struct node_data {
          long key;
          long data;
          struct bpf_rb_node node;
  };

  struct map_value {
          struct node_data __kptr *node;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __type(key, int);
          __type(value, struct map_value);
          __uint(max_entries, 1);
  } some_nodes SEC(".maps");

If struct node_data had a matching definition in kernel BTF, the
verifier would expect a destructor for the type to be registered. Since
struct node_data does not match any type in kernel BTF, the verifier
knows that there is no kfunc that provides a PTR_TO_BTF_ID to this
type, and that such a PTR_TO_BTF_ID can only come from bpf_obj_new. So
instead of searching for a registered dtor, a bpf_obj_drop dtor can be
assumed.

This allows the runtime to properly destruct such kptrs in
bpf_obj_free_fields, which enables maps to clean up map_vals w/ such
kptrs when going away.

Implementation notes:
  * "kernel_btf" variable is renamed to "kptr_btf" in btf_parse_kptr.
    Before this patch, the variable would only ever point to vmlinux or
    module BTFs, but now it can point to some program BTF for a local
    kptr type. It's later used to populate the (btf, btf_id) pair in the
    kptr btf field.
  * It's necessary to btf_get the program BTF when populating btf_field
    for a local kptr. btf_record_free later does a btf_put.
  * Behavior for non-local referenced kptrs is not modified, as the
    bpf_find_btf_id helper only searches vmlinux and module BTFs for
    a matching BTF type. If such a type is found, btf_field_kptr's btf
    will pass the btf_is_kernel check, and the associated release
    function is some one-argument dtor. If the btf_is_kernel check
    fails, the associated release function is the two-arg
    bpf_obj_drop_impl. Before this patch only btf_field_kptr's w/ kernel
    or module BTFs were created.
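Conceptually, the destructor selection at free time then looks like the
following sketch (simplified; the actual call sites plumb more state
around, and "rec" stands for the pointee type's btf_record):

  /* in bpf_obj_free_fields(), when dropping a stashed kptr */
  if (btf_is_kernel(field->kptr.btf))
          field->kptr.dtor(xchgd_field);            /* registered kfunc dtor */
  else
          field->kptr.obj_drop(xchgd_field, rec);   /* implicit bpf_obj_drop */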
Signed-off-by: Dave Marchevsky
Link: https://lore.kernel.org/r/20230310230743.2320707-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 11 ++++++++++-
 include/linux/btf.h |  2 --
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3a38db315f7f..756b85f0d0d3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -189,10 +189,19 @@ enum btf_field_type {
 		   BPF_RB_NODE | BPF_RB_ROOT,
 };

+typedef void (*btf_dtor_kfunc_t)(void *);
+typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *);
+
 struct btf_field_kptr {
 	struct btf *btf;
 	struct module *module;
-	btf_dtor_kfunc_t dtor;
+	union {
+		/* dtor used if btf_is_kernel(btf), otherwise the type
+		 * is program-allocated and obj_drop is used
+		 */
+		btf_dtor_kfunc_t dtor;
+		btf_dtor_obj_drop obj_drop;
+	};
 	u32 btf_id;
 };

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 1bba0827e8c4..d53b10cc55f2 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -121,8 +121,6 @@ struct btf_struct_metas {
 	struct btf_struct_meta types[];
 };

-typedef void (*btf_dtor_kfunc_t)(void *);
-
 extern const struct file_operations btf_fops;

 void btf_get(struct btf *btf);
-- cgit v1.2.3


From 22df776a9a866713d9decfb92b633bcfdb571954 Mon Sep 17 00:00:00 2001
From: David Vernet
Date: Wed, 15 Feb 2023 17:30:33 -0600
Subject: tasks: Extract rcu_users out of union

In commit 3fbd7ee285b2b ("tasks: Add a count of task RCU users"), a
count on the number of RCU users was added to struct task_struct. This
was done so as to enable the removal of task_rcu_dereference(), and
allow tasks to be protected by RCU even after exiting and being removed
from the runqueue. In this commit, the 'refcount_t rcu_users' field
that keeps track of this refcount was put into a union co-located with
'struct rcu_head rcu', so as to avoid taking up any extra space in
task_struct. This was possible to do safely, because the field was only
ever decremented by a static set of specific callers, and then never
incremented again.

While this restriction of there only being a small, static set of users
of this field has worked fine, it prevents us from leveraging the field
to use RCU to protect tasks in other contexts.

During tracing, for example, it would be useful to be able to collect
some tasks that performed a certain operation, put them in a map, and
then periodically summarize who they are, which cgroup they're in, how
much CPU time they've utilized, etc. While this can currently be done
with 'usage', it becomes tricky when a task is already in a map, or if
a reference should only be taken if a task is valid and will not soon
be reaped. Ideally, we could do something like pass a reference to a
map value, and then try to acquire a reference to the task in an RCU
read region by using refcount_inc_not_zero().

Similarly, in sched_ext, schedulers are using integer pids to remember
tasks, and then looking them up with find_task_by_pid_ns(). This is
slow, error prone, and adds complexity. It would be more convenient and
performant if BPF schedulers could instead store tasks directly in
maps, and then leverage RCU to ensure they can be safely accessed with
low overhead.

Finally, overloading fields like this is error prone. Someone that
wants to use 'rcu_users' could easily overlook the fact that once the
rcu callback is scheduled, the refcount will go back to being nonzero,
thus precluding the use of refcount_inc_not_zero().
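The acquire pattern this would enable might look like the following
sketch (the map-value plumbing and the 'task' field name are
hypothetical):

  struct task_struct *task;

  rcu_read_lock();
  task = READ_ONCE(map_val->task);        /* hypothetical stashed pointer */
  if (task && refcount_inc_not_zero(&task->rcu_users)) {
          /* task cannot be reaped until the matching
           * put_task_struct_rcu_user() is called
           */
  }
  rcu_read_unlock();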
Furthermore, as described below, it's possible to extract the fields of
the union without changing the size of task_struct.

There are several possible ways to enable this:

1. The lightest-touch approach is likely the one proposed in this
   patch, which is to simply extract 'rcu_users' and 'rcu' from the
   union, so that scheduling the 'rcu' callback doesn't overwrite the
   'rcu_users' refcount. If we have a trusted task pointer, this would
   allow us to use refcount_inc_not_zero() inside of an RCU region to
   determine if we can safely acquire a reference to the task and store
   it in a map. As mentioned below, this can be done without changing
   the size of task_struct, by moving the union to another location
   that has padding gaps we can fill in.

2. Removing 'refcount_t rcu_users', and instead having the entire task
   be freed in an rcu callback. This is likely the most sound overall
   design, though it changes the behavioral semantics exposed to
   callers, who currently expect that a task that's successfully looked
   up in e.g. the pid_list with find_task_by_pid_ns() can always have
   a 'usage' reference acquired on it, as it's guaranteed to be > 0
   until after the next gp. In order for this approach to work, we'd
   have to audit all callers. This approach also slightly changes
   behavior observed by user space by not invoking
   trace_sched_process_free() until the whole task_struct is actually
   being freed, rather than just after it's exited. It also may change
   timings, as memory will be freed in an RCU callback rather than
   immediately when the final 'usage' refcount drops to 0. This is also
   arguably a benefit, as it provides more predictable performance to
   callers who are refcounting tasks.

3. There may be other solutions as well that don't require changing the
   layout of task_struct. For example, we could possibly do something
   complex from the BPF side, such as listen for task exit and remove
   a task from a map when the task is exiting. This would likely
   require significant custom handling for task_struct in the verifier,
   so a more generalizable solution is likely warranted.

As mentioned above, this patch proposes the lightest-touch approach,
which allows callers elsewhere in the kernel to use 'rcu_users' to
ensure the lifetime of a task, by extracting 'rcu_users' and 'rcu' from
the union. There is no size change in task_struct with this patch.

Cc: Linus Torvalds
Cc: Eric W. Biederman
Cc: Peter Zijlstra
Cc: Ingo Molnar
Signed-off-by: David Vernet
Acked-by: Oleg Nesterov
Link: https://lore.kernel.org/r/20230215233033.889644-1-void@manifault.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/sched.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..b11b4517760f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1318,11 +1318,6 @@ struct task_struct {

 	struct tlbflush_unmap_batch	tlb_ubc;

-	union {
-		refcount_t		rcu_users;
-		struct rcu_head		rcu;
-	};
-
 	/* Cache last used pipe for splice(): */
 	struct pipe_inode_info		*splice_pipe;

@@ -1459,6 +1454,8 @@ struct task_struct {
 	unsigned long saved_state_change;
 # endif
 #endif
+	struct rcu_head			rcu;
+	refcount_t			rcu_users;
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
-- cgit v1.2.3


From 9e36a204bd43553a9cd4bd574612cd9a5df791ea Mon Sep 17 00:00:00 2001
From: Dave Marchevsky
Date: Mon, 13 Mar 2023 14:46:41 -0700
Subject: bpf: Disable migration when freeing stashed local kptr using obj drop

When a local kptr is stashed in a map and freed when the map goes away,
currently an error like the below appears:

  [   39.195695] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u32:15/2875
  [   39.196549] caller is bpf_mem_free+0x56/0xc0
  [   39.196958] CPU: 15 PID: 2875 Comm: kworker/u32:15 Tainted: G O 6.2.0-13016-g22df776a9a86 #4477
  [   39.197897] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
  [   39.198949] Workqueue: events_unbound bpf_map_free_deferred
  [   39.199470] Call Trace:
  [   39.199703]  <TASK>
  [   39.199911]  dump_stack_lvl+0x60/0x70
  [   39.200267]  check_preemption_disabled+0xbf/0xe0
  [   39.200704]  bpf_mem_free+0x56/0xc0
  [   39.201032]  ? bpf_obj_new_impl+0xa0/0xa0
  [   39.201430]  bpf_obj_free_fields+0x1cd/0x200
  [   39.201838]  array_map_free+0xad/0x220
  [   39.202193]  ? finish_task_switch+0xe5/0x3c0
  [   39.202614]  bpf_map_free_deferred+0xea/0x210
  [   39.203006]  ? lockdep_hardirqs_on_prepare+0xe/0x220
  [   39.203460]  process_one_work+0x64f/0xbe0
  [   39.203822]  ? pwq_dec_nr_in_flight+0x110/0x110
  [   39.204264]  ? do_raw_spin_lock+0x107/0x1c0
  [   39.204662]  ? lockdep_hardirqs_on_prepare+0xe/0x220
  [   39.205107]  worker_thread+0x74/0x7a0
  [   39.205451]  ? process_one_work+0xbe0/0xbe0
  [   39.205818]  kthread+0x171/0x1a0
  [   39.206111]  ? kthread_complete_and_exit+0x20/0x20
  [   39.206552]  ret_from_fork+0x1f/0x30
  [   39.206886]  </TASK>

This happens because the call to __bpf_obj_drop_impl I added in the
patch adding support for stashing local kptrs doesn't disable
migration. Prior to that patch, __bpf_obj_drop_impl logic only ran when
called by a BPF program, whereas now it can be called from the map free
path, so it's necessary to explicitly disable migration.

Also, refactor a bit to just call __bpf_obj_drop_impl directly instead
of bothering w/ dtor union and setting pointer-to-obj_drop.
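A sketch of the resulting fix in the map-free path (simplified from the
actual change; the variable names are assumptions):

  /* map teardown runs in sleepable process context, so pin the CPU
   * around the bpf_mem_free-based drop of the stashed object
   */
  migrate_disable();
  __bpf_obj_drop_impl(xchgd_field, meta ? meta->record : NULL);
  migrate_enable();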
Fixes: c8e187540914 ("bpf: Support __kptr to local kptrs")
Reported-by: Alexei Starovoitov
Signed-off-by: Dave Marchevsky
Link: https://lore.kernel.org/r/20230313214641.3731908-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 756b85f0d0d3..71cc92a4ba48 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -190,18 +190,14 @@ enum btf_field_type {
 };

 typedef void (*btf_dtor_kfunc_t)(void *);
-typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *);

 struct btf_field_kptr {
 	struct btf *btf;
 	struct module *module;
-	union {
-		/* dtor used if btf_is_kernel(btf), otherwise the type
-		 * is program-allocated and obj_drop is used
-		 */
-		btf_dtor_kfunc_t dtor;
-		btf_dtor_obj_drop obj_drop;
-	};
+	/* dtor used if btf_is_kernel(btf), otherwise the type is
+	 * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used
+	 */
+	btf_dtor_kfunc_t dtor;
 	u32 btf_id;
 };

-- cgit v1.2.3


From 2c854e5fcd7e243f5a7cf6a6afa0ef83060c903c Mon Sep 17 00:00:00 2001
From: Alexander Lobakin
Date: Mon, 13 Mar 2023 22:55:51 +0100
Subject: net: page_pool, skbuff: make skb_mark_for_recycle() always available

skb_mark_for_recycle() is guarded with CONFIG_PAGE_POOL, which creates
unneeded complication when using it in generic code. For now, it's only
used in the drivers always selecting Page Pool, so this works.

Move the guards so that the preprocessor will cut out only the
operation itself and the function will still be a noop on !PAGE_POOL
systems, but available there as well.

No functional changes.

Reported-by: kernel test robot
Link: https://lore.kernel.org/oe-kbuild-all/202303020342.Wi2PRFFH-lkp@intel.com
Signed-off-by: Alexander Lobakin
Link: https://lore.kernel.org/r/20230313215553.1045175-3-aleksander.lobakin@intel.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fe661011644b..3f3a2a82a86b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -5069,12 +5069,12 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
 #endif
 }

-#ifdef CONFIG_PAGE_POOL
 static inline void skb_mark_for_recycle(struct sk_buff *skb)
 {
+#ifdef CONFIG_PAGE_POOL
 	skb->pp_recycle = 1;
-}
 #endif
+}

 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
-- cgit v1.2.3


From 31bf1dbccfb0a9861d4846755096b3fff5687f8a Mon Sep 17 00:00:00 2001
From: Viktor Malik
Date: Fri, 10 Mar 2023 08:40:59 +0100
Subject: bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules

This resolves two problems with attachment of
fentry/fexit/fmod_ret/lsm to functions located in modules:

1. The verifier tries to find the address to attach to in kallsyms.
   This is always done by searching the entire kallsyms, not respecting
   the module in which the function is located. Such an approach causes
   an incorrect attachment address to be computed if the function to
   attach to is shadowed by a function of the same name located earlier
   in kallsyms.

2. If the address to attach to is located in a module, the module
   reference is only acquired in register_fentry. If the module is
   unloaded between the place where the address is found
   (bpf_check_attach_target in the verifier) and register_fentry, it is
   possible that another module is loaded to the same address, which
   may lead to potential errors.
Since the attachment must contain the BTF of the program to attach to,
we extract the module from it and search for the function address in
the correct module (resolving problem no. 1). Then, the module
reference is taken directly in bpf_check_attach_target and stored in
the bpf program (in bpf_prog_aux). The reference is only released when
the program is unloaded (resolving problem no. 2).

Signed-off-by: Viktor Malik
Acked-by: Jiri Olsa
Reviewed-by: Luis Chamberlain
Link: https://lore.kernel.org/r/3f6a9d8ae850532b5ef864ef16327b0f7a669063.1678432753.git.vmalik@redhat.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 71cc92a4ba48..3ef98fb92987 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1103,6 +1103,7 @@ struct bpf_trampoline {
 struct bpf_attach_target_info {
 	struct btf_func_model fmodel;
 	long tgt_addr;
+	struct module *tgt_mod;
 	const char *tgt_name;
 	const struct btf_type *tgt_type;
 };
@@ -1406,6 +1407,7 @@ struct bpf_prog_aux {
 	 * main prog always has linfo_idx == 0
 	 */
 	u32 linfo_idx;
+	struct module *mod;
 	u32 num_exentries;
 	struct exception_table_entry *extable;
 	union {
-- cgit v1.2.3


From bd5314f8dd2d41330eecb60f0490c3fcfe1fc99d Mon Sep 17 00:00:00 2001
From: Viktor Malik
Date: Fri, 17 Mar 2023 10:56:01 +0100
Subject: kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header

Moving find_kallsyms_symbol_value from kernel/module/internal.h to
include/linux/module.h. The reason is that internal.h is not prepared
to be included when CONFIG_MODULES=n. find_kallsyms_symbol_value is
used by kernel/bpf/verifier.c, and including internal.h from it
(without modules) leads to a compilation error:

  In file included from ../include/linux/container_of.h:5,
                   from ../include/linux/list.h:5,
                   from ../include/linux/timer.h:5,
                   from ../include/linux/workqueue.h:9,
                   from ../include/linux/bpf.h:10,
                   from ../include/linux/bpf-cgroup.h:5,
                   from ../kernel/bpf/verifier.c:7:
  ../kernel/bpf/../module/internal.h: In function 'mod_find':
  ../include/linux/container_of.h:20:54: error: invalid use of undefined type 'struct module'
     20 |         static_assert(__same_type(*(ptr), ((type *)0)->member) ||      \
        |                                                      ^~
  [...]

This patch fixes the above error.

Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules")
Reported-by: kernel test robot
Signed-off-by: Viktor Malik
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/oe-kbuild-all/202303161404.OrmfCy09-lkp@intel.com/
Link: https://lore.kernel.org/bpf/20230317095601.386738-1-vmalik@redhat.com
---
 include/linux/module.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 4435ad9439ab..41cfd3be57e5 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -616,6 +616,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 /* Look for this name: can be of form module:name. */
 unsigned long module_kallsyms_lookup_name(const char *name);

+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
+
 extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
 			long code);
 #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)
@@ -796,6 +798,12 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name)
 	return 0;
 }

+static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
+						       const char *name)
+{
+	return 0;
+}
+
 static inline int register_module_notifier(struct notifier_block *nb)
 {
 	/* no events will happen anyway, so this can always succeed */
-- cgit v1.2.3


From 04aae213e719ec2bb310158c4025316ace50589b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Mon, 20 Mar 2023 18:41:13 -0700
Subject: net: skbuff: rename __pkt_vlan_present_offset to __mono_tc_offset

vlan_present is gone since commit 354259fa73e2 ("net: remove
skb->vlan_present"), so rename the offset field to what BPF is
currently looking for in this byte: mono_delivery_time and
tc_at_ingress.

Signed-off-by: Jakub Kicinski
Link: https://lore.kernel.org/r/20230321014115.997841-2-kuba@kernel.org
Signed-off-by: Martin KaFai Lau
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3f3a2a82a86b..5a63878a4550 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -955,7 +955,7 @@ struct sk_buff {
 	__u8			csum_valid:1;

 	/* private: */
-	__u8			__pkt_vlan_present_offset[0];
+	__u8			__mono_tc_offset[0];
 	/* public: */
 	__u8			remcsum_offload:1;
 	__u8			csum_complete_sw:1;
@@ -1078,7 +1078,7 @@ struct sk_buff {
 #define TC_AT_INGRESS_MASK		(1 << 7)
 #define SKB_MONO_DELIVERY_TIME_MASK	(1 << 5)
 #endif
-#define PKT_VLAN_PRESENT_OFFSET	offsetof(struct sk_buff, __pkt_vlan_present_offset)
+#define SKB_BF_MONO_TC_OFFSET	offsetof(struct sk_buff, __mono_tc_offset)

 #ifdef __KERNEL__
 /*
-- cgit v1.2.3


From b94e032b7ad6318b36615f0e0cc3b0d61a5531e8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Mon, 20 Mar 2023 18:41:14 -0700
Subject: net: skbuff: reorder bytes 2 and 3 of the bitfield

BPF needs to know the offsets of fields it tries to access. Zero-length
fields are added to make offsetof() work. This unfortunately partitions
the bitfield (fields across the zero-length members can't be
coalesced).

Reorder bytes 2 and 3: BPF needs to know the offset of fields
previously in byte 3, and some fields in byte 2 should really be
optional. The two bytes are always in the same cacheline, so it should
not matter.

Signed-off-by: Jakub Kicinski
Link: https://lore.kernel.org/r/20230321014115.997841-3-kuba@kernel.org
Signed-off-by: Martin KaFai Lau
---
 include/linux/skbuff.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5a63878a4550..36d31e74db37 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -944,16 +944,6 @@ struct sk_buff {
 	__u8			ip_summed:2;
 	__u8			ooo_okay:1;

-	__u8			l4_hash:1;
-	__u8			sw_hash:1;
-	__u8			wifi_acked_valid:1;
-	__u8			wifi_acked:1;
-	__u8			no_fcs:1;
-	/* Indicates the inner headers are valid in the skbuff. */
-	__u8			encapsulation:1;
-	__u8			encap_hdr_csum:1;
-	__u8			csum_valid:1;
-
 	/* private: */
 	__u8			__mono_tc_offset[0];
 	/* public: */
@@ -966,6 +956,16 @@ struct sk_buff {
 	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;	/* See TC_AT_INGRESS_MASK */
 #endif
+
+	__u8			l4_hash:1;
+	__u8			sw_hash:1;
+	__u8			wifi_acked_valid:1;
+	__u8			wifi_acked:1;
+	__u8			no_fcs:1;
+	/* Indicates the inner headers are valid in the skbuff. */
+	__u8			encapsulation:1;
+	__u8			encap_hdr_csum:1;
+	__u8			csum_valid:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
-- cgit v1.2.3


From c0ba861117c3e8deb03855d7dc5a7717958bbb18 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Mon, 20 Mar 2023 18:41:15 -0700
Subject: net: skbuff: move the fields BPF cares about directly next to the offset marker

To avoid more possible BPF dependencies with moving bitfields around,
keep the fields BPF cares about right next to the offset marker.

Signed-off-by: Jakub Kicinski
Link: https://lore.kernel.org/r/20230321014115.997841-4-kuba@kernel.org
Signed-off-by: Martin KaFai Lau
---
 include/linux/skbuff.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 36d31e74db37..6aeb0e7b9511 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -947,15 +947,15 @@ struct sk_buff {
 	/* private: */
 	__u8			__mono_tc_offset[0];
 	/* public: */
-	__u8			remcsum_offload:1;
-	__u8			csum_complete_sw:1;
-	__u8			csum_level:2;
-	__u8			dst_pending_confirm:1;
 	__u8			mono_delivery_time:1;	/* See SKB_MONO_DELIVERY_TIME_MASK */
 #ifdef CONFIG_NET_CLS_ACT
-	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;	/* See TC_AT_INGRESS_MASK */
+	__u8			tc_skip_classify:1;
 #endif
+	__u8			remcsum_offload:1;
+	__u8			csum_complete_sw:1;
+	__u8			csum_level:2;
+	__u8			dst_pending_confirm:1;

 	__u8			l4_hash:1;
 	__u8			sw_hash:1;
@@ -1072,11 +1072,11 @@ struct sk_buff {
  * around, you also must adapt these constants.
  */
 #ifdef __BIG_ENDIAN_BITFIELD
-#define TC_AT_INGRESS_MASK		(1 << 0)
-#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 2)
+#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 7)
+#define TC_AT_INGRESS_MASK		(1 << 6)
 #else
-#define TC_AT_INGRESS_MASK		(1 << 7)
-#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 5)
+#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 0)
+#define TC_AT_INGRESS_MASK		(1 << 1)
 #endif
 #define SKB_BF_MONO_TC_OFFSET		offsetof(struct sk_buff, __mono_tc_offset)
-- cgit v1.2.3


From d7ba4cc900bf1eea2d8c807c6b1fc6bd61f41237 Mon Sep 17 00:00:00 2001
From: JP Kobryn
Date: Wed, 22 Mar 2023 12:47:54 -0700
Subject: bpf: return long from bpf_map_ops funcs

This patch changes the return types of bpf_map_ops functions to long,
where previously int was returned. Using long allows bpf programs to
maintain the sign bit in the absence of sign extension during
situations where inlined bpf helper funcs make calls to the
bpf_map_ops funcs and a negative error is returned.

The definitions of the helper funcs are generated from comments in the
bpf uapi header at `include/uapi/linux/bpf.h`. The return type of these
helpers was previously changed from int to long in commit bdb7b79b4ce8.
For any case where one of the map helpers calls the bpf_map_ops funcs
that still return a 32-bit int, a compiler might not include sign
extension instructions to properly convert the 32-bit negative value
to a 64-bit negative value.

For example:

bpf assembly excerpt of an inlined helper calling a kernel function and
checking for a specific error:

  ; err = bpf_map_update_elem(&mymap, &key, &val, BPF_NOEXIST);
  ...
  46: call 0xffffffffe103291c        ; htab_map_update_elem
  ; if (err && err != -EEXIST) {
  4b: cmp $0xffffffffffffffef,%rax   ; cmp -EEXIST,%rax

kernel function assembly excerpt of return value from
`htab_map_update_elem` returning 32-bit int:

  movl $0xffffffef, %r9d
  ...
  movl %r9d, %eax

...results in the comparison:

  cmp $0xffffffffffffffef, $0x00000000ffffffef

Fixes: bdb7b79b4ce8 ("bpf: Switch most helper return values from 32-bit int to 64-bit long")
Tested-by: Eduard Zingerman
Signed-off-by: JP Kobryn
Link: https://lore.kernel.org/r/20230322194754.185781-3-inwardvessel@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h    | 14 +++++++-------
 include/linux/filter.h |  6 +++---
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3ef98fb92987..ec0df059f562 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -96,11 +96,11 @@ struct bpf_map_ops {

 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
-	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
-	int (*map_delete_elem)(struct bpf_map *map, void *key);
-	int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
-	int (*map_pop_elem)(struct bpf_map *map, void *value);
-	int (*map_peek_elem)(struct bpf_map *map, void *value);
+	long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
+	long (*map_delete_elem)(struct bpf_map *map, void *key);
+	long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
+	long (*map_pop_elem)(struct bpf_map *map, void *value);
+	long (*map_peek_elem)(struct bpf_map *map, void *value);
 	void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu);

 	/* funcs called by prog_array and perf_event_array map */
@@ -139,7 +139,7 @@ struct bpf_map_ops {
 	struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);

 	/* Misc helpers.*/
-	int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);
+	long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);

 	/* map_meta_equal must be implemented for maps that can be
 	 * used as an inner map.  It is a runtime check to ensure
@@ -157,7 +157,7 @@ struct bpf_map_ops {
 	int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
 					      struct bpf_func_state *caller,
 					      struct bpf_func_state *callee);
-	int (*map_for_each_callback)(struct bpf_map *map,
+	long (*map_for_each_callback)(struct bpf_map *map,
 				     bpf_callback_t callback_fn,
 				     void *callback_ctx, u64 flags);

diff --git a/include/linux/filter.h b/include/linux/filter.h
index efa5d4a1677e..23c08c31bea9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1504,9 +1504,9 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */

-static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
-						  u64 flags, const u64 flag_mask,
-						  void *lookup_elem(struct bpf_map *map, u32 key))
+static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
+						   u64 flags, const u64 flag_mask,
+						   void *lookup_elem(struct bpf_map *map, u32 key))
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
-- cgit v1.2.3


From b671c2067a04c0668df174ff5dfdb573d1f9b074 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee
Date: Wed, 22 Mar 2023 20:23:58 -0700
Subject: bpf: Retire the struct_ops map kvalue->refcnt.
We have replaced kvalue->refcnt with synchronize_rcu() to wait for an
RCU grace period.

Maintenance of kvalue->refcnt was a complicated task, as we had to
simultaneously keep track of two reference counts: kvalue->refcnt
itself and the reference count of bpf_map. When kvalue->refcnt reaches
zero, we also have to reduce the reference count on bpf_map - yet these
steps are not performed in an atomic manner and require us to be
vigilant when managing them. By eliminating kvalue->refcnt, we can make
our maintenance more straightforward, as the refcount of bpf_map is now
the only one that needs to be managed!

To prevent the trampoline image of a struct_ops from being released
while it is still in use, we wait for an RCU grace period. The
setsockopt(TCP_CONGESTION, "...") command allows you to change your
socket's congestion control algorithm and can result in releasing the
old struct_ops implementation. That is fine. However, this function is
also exposed through bpf_setsockopt(), so it may be invoked by BPF
programs as well. To ensure that the trampoline image belonging to a
struct_ops can be safely called while its method is in use, the
trampoline safeguards the BPF program with rcu_read_lock(). Doing so
prevents any destruction of the associated images before returning from
a trampoline and requires us to wait for an RCU grace period.

Signed-off-by: Kui-Feng Lee
Link: https://lore.kernel.org/r/20230323032405.3735486-2-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ec0df059f562..f04098468d7a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1945,6 +1945,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
 void bpf_map_inc(struct bpf_map *map);
 void bpf_map_inc_with_uref(struct bpf_map *map);
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
 struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
-- cgit v1.2.3


From 68b04864ca425d1894c96b8141d4fba1181f11cb Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee
Date: Wed, 22 Mar 2023 20:24:00 -0700
Subject: bpf: Create links for BPF struct_ops maps.

Make bpf_link support struct_ops. Previously, struct_ops were always
used alone without any associated links. Upon updating its value, a
struct_ops would be activated automatically. Yet other BPF program
types were required to make a bpf_link with their instances before they
could become active. Now, however, you can create an inactive
struct_ops, and create a link to activate it later.

With bpf_links, struct_ops has a behavior similar to other BPF program
types. You can pin/unpin them from their links, and the struct_ops will
be deactivated when its link is removed, whereas previously someone had
to delete the value for it to be deactivated.

bpf_links are responsible for registering their associated struct_ops.
You can only use a struct_ops that has the BPF_F_LINK flag set to
create a bpf_link, while a struct_ops without this flag behaves in the
same manner as before and is registered upon updating its value.

The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it used
to craft the links for BPF struct_ops programs, but also to create
links for BPF struct_ops themselves. Since the links of BPF struct_ops
programs are only used to create trampolines internally, they are never
seen in other contexts.
To maintain a reference to the map backing this link, we add bpf_struct_ops_link as an additional type. The map pointer in it is RCU-protected and won't be needed until later in the patchset. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20230323032405.3735486-4-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f04098468d7a..8552279efe46 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1518,6 +1518,7 @@ struct bpf_struct_ops { void *kdata, const void *udata); int (*reg)(void *kdata); void (*unreg)(void *kdata); + int (*validate)(void *kdata); const struct btf_type *type; const struct btf_type *value_type; const char *name; @@ -1552,6 +1553,7 @@ static inline void bpf_module_put(const void *data, struct module *owner) else module_put(owner); } +int bpf_struct_ops_link_create(union bpf_attr *attr); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ @@ -1592,6 +1594,11 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, { return -EINVAL; } +static inline int bpf_struct_ops_link_create(union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + #endif #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) -- cgit v1.2.3 From aef56f2e918bf8fc8de25f0b36e8c2aba44116ec Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:24:02 -0700 Subject: bpf: Update the struct_ops of a bpf_link. Improve the BPF_LINK_UPDATE command of bpf() to allow conveniently switching between different struct_ops on a single bpf_link. This enables smoother transitions from one struct_ops to another. The struct_ops maps passed along with BPF_LINK_UPDATE must have the BPF_F_LINK flag. Signed-off-by: Kui-Feng Lee Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230323032405.3735486-6-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8552279efe46..2d8f3f639e68 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1476,6 +1476,8 @@ struct bpf_link_ops { void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); int (*fill_link_info)(const struct bpf_link *link, struct bpf_link_info *info); + int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, + struct bpf_map *old_map); }; struct bpf_tramp_link { @@ -1518,6 +1520,7 @@ struct bpf_struct_ops { void *kdata, const void *udata); int (*reg)(void *kdata); void (*unreg)(void *kdata); + int (*update)(void *kdata, void *old_kdata); int (*validate)(void *kdata); const struct btf_type *type; const struct btf_type *value_type; -- cgit v1.2.3 From e65a5c6edbc6ca4853e6076bd81db1a410592a09 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:42 -0700 Subject: bpf: Add a few bpf mem allocator functions This patch adds a few bpf mem allocator functions which will be used by bpf_local_storage in a later patch. bpf_mem_cache_alloc_flags(..., gfp_t flags) is added. When flags == GFP_KERNEL, it will fall back to __alloc(..., GFP_KERNEL). bpf_local_storage knows when its running context is sleepable (GFP_KERNEL), which provides a better guarantee on memory allocation. bpf_local_storage has some uncommon cases in which its selem cannot be reused immediately.
It handles its own rcu_head, goes through an rcu_trace gp, and then frees it. bpf_mem_cache_raw_free() is added for direct-free purposes, without leaking the internal LLIST_NODE_SZ knowledge. At free time, the 'struct bpf_mem_alloc *ma' is no longer available. However, the caller should know whether it is percpu memory or not, and it can call the corresponding raw_free function. bpf_local_storage does not support percpu values, so only the non-percpu bpf_mem_cache_raw_free() is added in this patch. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index a7104af61ab4..3929be5743f4 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -31,5 +31,7 @@ void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); /* kmem_cache_alloc/free equivalent: */ void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); +void bpf_mem_cache_raw_free(void *ptr); +void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags); #endif /* _BPF_MEM_ALLOC_H */ -- cgit v1.2.3 From 08a7ce384e33e53e0732c500a8af67a73f8fceca Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:43 -0700 Subject: bpf: Use bpf_mem_cache_alloc/free in bpf_local_storage_elem This patch uses bpf_mem_alloc for the task and cgroup local storage, where the bpf prog can easily get hold of the storage owner's PTR_TO_BTF_ID; e.g., bpf_get_current_task_btf() can be used in some of the kmalloc code paths, which would cause deadlock/recursion. bpf_mem_cache_alloc is deadlock free and will solve a legit use case in [1]. For sk storage, its batch-creation benchmark shows a few percent regression when the sk create/destroy batch size is larger than 32. The sk creation/destruction happens much more often and depends on external traffic. Considering it is hypothetical to be able to cause a deadlock with sk storage, it can cross that bridge and use bpf_mem_alloc once a legit (ie. useful) use case comes up. For inode storage, bpf_local_storage_destroy() is called before waiting for an rcu gp, and its memory cannot be reused immediately. inode stays with kmalloc/kfree after the rcu [or tasks_trace] gp. A 'bool bpf_ma' argument is added to bpf_local_storage_map_alloc(). Only task and cgroup storage have 'bpf_ma == true', which means to use bpf_mem_cache_alloc/free(). This patch only changes the selem to use bpf_mem_alloc for task and cgroup. The next patch will change the local_storage to use bpf_mem_alloc for task and cgroup as well. Here are some more details on the changes: * memory allocation: After bpf_mem_cache_alloc(), the SDATA(selem)->data is zeroed, because bpf_mem_cache_alloc() could return a reused selem. This keeps the existing bpf_map_kzalloc() behavior. Only SDATA(selem)->data is zeroed; it is the part visible to the bpf prog. There is no need to use zero_map_value() to do the zeroing, because bpf_selem_free(..., reuse_now = true) ensures no bpf prog is using the selem before returning the selem through bpf_mem_cache_free(). The internal fields of the selem will be initialized when it is linked to the new smap and the new local_storage. When 'bpf_ma == false', nothing changes in this patch; it stays with bpf_map_kzalloc(), as the sketch below illustrates.
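A rough sketch of the allocation path just described (simplified; the actual allocation code in kernel/bpf/bpf_local_storage.c differs in details such as memcg charging and flag handling):

    static struct bpf_local_storage_elem *
    selem_alloc_sketch(struct bpf_local_storage_map *smap, void *value,
                       gfp_t gfp_flags)
    {
        struct bpf_local_storage_elem *selem;

        if (smap->bpf_ma) {
            selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
            if (selem)
                /* could be reused memory: zero only the part the
                 * bpf prog can see; internal fields are set up when
                 * the selem is linked to smap/local_storage */
                memset(SDATA(selem)->data, 0, smap->map.value_size);
        } else {
            /* unchanged legacy path */
            selem = bpf_map_kzalloc(&smap->map, smap->elem_size, gfp_flags);
        }
        if (selem && value)
            copy_map_value(&smap->map, SDATA(selem)->data, value);
        return selem;
    }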
* memory free: The bpf_selem_free() and bpf_selem_free_rcu() are modified to handle the bpf_ma == true case. For the common selem free path, where its owner is also being destroyed, the memory is freed in bpf_local_storage_destroy(); the owner (task or cgroup) has already gone through an rcu gp, so the memory can be reused immediately. bpf_local_storage_destroy() will therefore call bpf_selem_free(..., reuse_now = true), which will do bpf_mem_cache_free() for immediate-reuse consideration. An exception is the delete-elem code path, which is called from the helper bpf_*_storage_delete() and the syscall bpf_map_delete_elem(). This path is an unusual case for local storage, because the common use case is to have the local storage stay for its owner's lifetime, so that the bpf prog and user space do not have to monitor the owner's destruction. For the delete-elem path, the selem cannot be reused immediately, because there could be a bpf prog still using it. It will call bpf_selem_free(..., reuse_now = false), which will wait for an rcu tasks trace gp before freeing the elem. The rcu callback is changed to do bpf_mem_cache_raw_free() instead of kfree(). When 'bpf_ma == false', it behaves the same as before; __bpf_selem_free() is added to do the kfree_rcu() and call_tasks_trace_rcu(). A few words on 'reuse_now == true': it is still racing with bpf_local_storage_map_free(), which runs under rcu protection, so it still needs to wait for an rcu gp instead of kfree()-ing directly. Otherwise, the selem may be reused by slab for a totally different struct while bpf_local_storage_map_free() is still using it (as an rcu reader). For the inode case, there may be other rcu readers as well. In short, bpf_ma == false and reuse_now == true => vanilla rcu. [1]: https://lore.kernel.org/bpf/20221118190109.1512674-1-namhyung@kernel.org/ Cc: Namhyung Kim Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a34f61467a2f..30efbcab2798 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 @@ -55,6 +56,8 @@ struct bpf_local_storage_map { u32 bucket_log; u16 elem_size; u16 cache_idx; + struct bpf_mem_alloc selem_ma; + bool bpf_ma; }; struct bpf_local_storage_data { @@ -122,7 +125,8 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, - struct bpf_local_storage_cache *cache); + struct bpf_local_storage_cache *cache, + bool bpf_ma); struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, -- cgit v1.2.3 From 6ae9d5e99e1dd26babdd9502759fa25a3fd348ad Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:44 -0700 Subject: bpf: Use bpf_mem_cache_alloc/free for bpf_local_storage This patch uses bpf_mem_cache_alloc/free for allocating and freeing bpf_local_storage for task and cgroup storage. The changes are similar to the previous patch. A few things worth mentioning for bpf_local_storage: The local_storage is freed when the last selem is deleted.
Before deleting a selem from local_storage, it needs to retrieve local_storage->smap, because bpf_selem_unlink_storage_nolock() may have set it to NULL. Note that local_storage->smap may already be NULL when the selem that created this local_storage has been removed. In this case, call_rcu will be used to free the local_storage. Also, the bpf_ma (true or false) value is needed before calling bpf_local_storage_free(). The bpf_ma can be obtained either from local_storage->smap (if available) or from any of its selems' smap. A new helper check_storage_bpf_ma() is added to obtain bpf_ma for a bpf_local_storage that is being deleted. When bpf_local_storage_alloc() gets reused memory, all fields either already hold correct values or will be initialized: 'cache[]' must already be all NULLs, 'list' must be empty, and the others will be initialized. Cc: Namhyung Kim Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 30efbcab2798..173ec7f43ed1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -57,6 +57,7 @@ struct bpf_local_storage_map { u16 elem_size; u16 cache_idx; struct bpf_mem_alloc selem_ma; + struct bpf_mem_alloc storage_ma; bool bpf_ma; }; -- cgit v1.2.3 From b7e852a9ec96635168c04204fb7cf1f7390b9a8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:23 -0700 Subject: bpf: Remove unused arguments from btf_struct_access(). Remove unused arguments from the btf_struct_access() callback. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-3-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 3 +-- include/linux/filter.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2d8f3f639e68..4f689dda748f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -893,8 +893,7 @@ struct bpf_verifier_ops { struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + int off, int size); }; struct bpf_prog_offload_ops { diff --git a/include/linux/filter.h b/include/linux/filter.h index 23c08c31bea9..5364b0c52c1d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -571,8 +571,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + int off, int size); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, -- cgit v1.2.3 From 63260df1396578226ac3134cf7f764690002e70e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:24 -0700 Subject: bpf: Refactor btf_nested_type_is_trusted(). btf_nested_type_is_trusted() tries to find a struct member at the corresponding offset. It works for flat structures but falls apart in more complex structs with nested structs, as the sketch below illustrates.
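A hypothetical example of where a flat, offset-based member search goes wrong (types invented purely for illustration):

    struct inner {
        struct task_struct *task;    /* offset 0 within inner */
    };

    struct outer {
        long pad;                    /* offset 0 */
        struct inner nested;         /* offset 8 */
    };

    /* An access at offset 8 into struct outer really refers to
     * nested.task, but a search over struct outer's own members only
     * finds the member "nested", not the "task" field that a
     * BTF_TYPE_SAFE*() annotation would name. Resolving it requires
     * recursing into struct inner. */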
The offset->member search is already performed by btf_struct_walk(), including for nested structs. Reuse this work and pass {field name, field btf id} into btf_nested_type_is_trusted() instead of an offset, to make the BTF_TYPE_SAFE*() logic more robust. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f689dda748f..002a811b6b90 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2263,7 +2263,7 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id, @@ -2302,7 +2302,7 @@ struct bpf_core_ctx { bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, const char *suffix); + const char *field_name, u32 btf_id, const char *suffix); bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, @@ -2517,7 +2517,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) static inline int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag) + u32 *next_btf_id, enum bpf_type_flag *flag, + const char **field_name) { return -EACCES; } -- cgit v1.2.3 From 34bf93472f8fb60b4189aa2872471017e739cf0a Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Thu, 30 Mar 2023 12:20:01 +0200 Subject: kallsyms: move module-related functions under correct configs Functions for searching module kallsyms should have non-empty definitions only if CONFIG_MODULES=y and CONFIG_KALLSYMS=y. Until now, only the CONFIG_MODULES check was used for many of these, which may have caused compilation errors on some configs. This patch moves all relevant functions under the correct configs. Fixes: bd5314f8dd2d ("kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header") Signed-off-by: Viktor Malik Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202303181535.RFDCnz3E-lkp@intel.com/ Link: https://lore.kernel.org/r/20230330102001.2183693-1-vmalik@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/module.h | 135 +++++++++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 41cfd3be57e5..886d24877c7c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -608,16 +608,6 @@ static inline bool within_module(unsigned long addr, const struct module *mod) /* Search for module by name: must be in a RCU-sched critical section. */ struct module *find_module(const char *name); -/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if symnum out of range. */ -int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *name, char *module_name, int *exported); - -/* Look for this name: can be of form module:name.
*/ -unsigned long module_kallsyms_lookup_name(const char *name); - -unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); - extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) @@ -664,17 +654,6 @@ static inline void __module_get(struct module *module) /* Dereference module function descriptor */ void *dereference_module_function_descriptor(struct module *mod, void *ptr); -/* For kallsyms to ask for address resolution. namebuf should be at - * least KSYM_NAME_LEN long: a pointer to namebuf is returned if - * found, otherwise NULL. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, const unsigned char **modbuildid, - char *namebuf); -int lookup_module_symbol_name(unsigned long addr, char *symname); -int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); - int register_module_notifier(struct notifier_block *nb); int unregister_module_notifier(struct notifier_block *nb); @@ -765,45 +744,6 @@ static inline void module_put(struct module *module) #define module_name(mod) "kernel" -/* For kallsyms to ask for address resolution. NULL means not found. */ -static inline const char *module_address_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, - const unsigned char **modbuildid, - char *namebuf) -{ - return NULL; -} - -static inline int lookup_module_symbol_name(unsigned long addr, char *symname) -{ - return -ERANGE; -} - -static inline int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name) -{ - return -ERANGE; -} - -static inline int module_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name, - char *module_name, int *exported) -{ - return -ERANGE; -} - -static inline unsigned long module_kallsyms_lookup_name(const char *name) -{ - return 0; -} - -static inline unsigned long find_kallsyms_symbol_value(struct module *mod, - const char *name) -{ - return 0; -} - static inline int register_module_notifier(struct notifier_block *nb) { /* no events will happen anyway, so this can always succeed */ @@ -899,7 +839,36 @@ int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, struct module *, unsigned long), void *data); -#else + +/* For kallsyms to ask for address resolution. namebuf should be at + * least KSYM_NAME_LEN long: a pointer to namebuf is returned if + * found, otherwise NULL. + */ +const char *module_address_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, const unsigned char **modbuildid, + char *namebuf); +int lookup_module_symbol_name(unsigned long addr, char *symname); +int lookup_module_symbol_attrs(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char *modname, + char *name); + +/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if + * symnum out of range. + */ +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported); + +/* Look for this name: can be of form module:name. 
*/ +unsigned long module_kallsyms_lookup_name(const char *name); + +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); + +#else /* CONFIG_MODULES && CONFIG_KALLSYMS */ + static inline int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, struct module *, unsigned long), void *data) { return -EOPNOTSUPP; } + +/* For kallsyms to ask for address resolution. NULL means not found. */ +static inline const char *module_address_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, + const unsigned char **modbuildid, + char *namebuf) +{ + return NULL; +} + +static inline int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + return -ERANGE; +} + +static inline int lookup_module_symbol_attrs(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char *modname, + char *name) +{ + return -ERANGE; +} + +static inline int module_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported) +{ + return -ERANGE; +} + +static inline unsigned long module_kallsyms_lookup_name(const char *name) +{ + return 0; +} + +static inline unsigned long find_kallsyms_symbol_value(struct module *mod, + const char *name) +{ + return 0; +} + #endif /* CONFIG_MODULES && CONFIG_KALLSYMS */ #endif /* _LINUX_MODULE_H */ -- cgit v1.2.3 From 4294a0a7ab6282c3d92f03de84e762dda993c93d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:47 -0700 Subject: bpf: Split off basic BPF verifier log into separate file The kernel/bpf/verifier.c file is large and growing larger all the time. So it's good to start splitting off more or less self-contained parts into separate files to keep source code size (somewhat) under control. This patch is one step in this direction, moving some of the BPF verifier log routines into a separate kernel/bpf/log.c. Right now it's the most low-level and isolated routines for appending data to the log, resetting the log to a previous position, etc. Eventually we could probably move verifier state printing logic here as well, but this patch doesn't attempt to do that yet. Subsequent patches will add more logic to verifier log management, so having the basics in a separate file will make sure verifier.c doesn't grow more with new changes.
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-2-andrii@kernel.org --- include/linux/bpf_verifier.h | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 81d525d057c7..83dff25545ee 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -498,11 +498,6 @@ struct bpf_verifier_log { u32 len_total; }; -static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) -{ - return log->len_used >= log->len_total - 1; -} - #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 @@ -512,6 +507,11 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U +static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) +{ + return log->len_used >= log->len_total - 1; +} + static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && @@ -519,13 +519,6 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) log->level == BPF_LOG_KERNEL); } -static inline bool -bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) -{ - return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 && - log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); -} - #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_info { @@ -608,12 +601,14 @@ struct bpf_verifier_env { char type_str_buf[TYPE_STR_BUF_LEN]; }; +bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log); __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); +void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3 From 1216640938035e63bdbd32438e91c9bcc1fd8ee1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:49 -0700 Subject: bpf: Switch BPF verifier log to be a rotating log by default Currently, if the user-supplied log buffer for collecting the BPF verifier log turns out to be too small to contain the full log, the bpf() syscall returns -ENOSPC, fails BPF program verification/load, and preserves the first N-1 bytes of the verifier log (where N is the size of the user-supplied buffer). This is problematic in a bunch of common scenarios, especially when working with real-world BPF programs that tend to be pretty complex as far as verification goes and require big log buffers. Typically, it's when debugging tricky cases at log level 2 (verbose). Also, when a BPF program is successfully validated, log level 2 is the only way to actually see verifier state progression and all the important details. Even with log level 1, it's possible to get -ENOSPC even if the final verifier log fits in the log buffer, if there is a code path that's deep enough to fill up the entire log, even if normally it would be reset later on (there is logic to chop off successfully validated portions of the BPF verifier log). In short, it's not always possible to pre-size the log buffer.
Also, what's worse, in practice the end of the log is most often way more important than the beginning, but the verifier stops emitting the log as soon as the initial log buffer is filled up. This patch switches the BPF verifier log to effectively behave as a rotating log. That is, if the user-supplied log buffer turns out to be too short, the verifier will keep overwriting previously written log, effectively treating the user's log buffer as a ring buffer. -ENOSPC is still going to be returned at the end, to notify the user that the log contents were truncated, but the important last N bytes of the log are returned, which might be all that the user really needs. This consistent -ENOSPC behavior, regardless of rotating or fixed log mode, allows us to prevent backwards compatibility breakage. The only user-visible change is which portion of the verifier log the user ends up seeing *if the buffer is too small*. Given that the contents of the verifier log itself are not an ABI, there is no breakage due to this behavior change. Specialized tools that rely on specific contents of the verifier log in the -ENOSPC scenario are expected to be easily adapted to accommodate the old and new behaviors. Importantly, though, to preserve a good user experience and not require every user-space application to adapt to this new behavior, before exiting to user-space the verifier will rotate the log (in place) to make it start at the very beginning of the user buffer as a continuous zero-terminated string. The contents will be the last N-1 bytes of the full verifier log, chopped off at the front, of course. Given that the beginning of the log is sometimes important as well, we add a BPF_LOG_FIXED (which equals 8) flag to force the old behavior, which allows tools like veristat to request the first part of the verifier log, if necessary. The BPF_LOG_FIXED flag is also a simple and straightforward way to check whether the BPF verifier supports the rotating behavior. On the implementation side, conceptually, it's all simple. We maintain 64-bit logical start and end positions. If we need to truncate the log, the start position is adjusted accordingly to lag the end position by N bytes. We then use those logical positions to calculate their matching actual positions in the user buffer and handle wrap-around at the end of the buffer properly. Finally, right before returning from bpf_check(), we rotate the user log buffer contents in place as necessary, to make the log contents contiguous. See comments in the relevant functions for details. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-4-andrii@kernel.org --- include/linux/bpf_verifier.h | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 83dff25545ee..4c926227f612 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -491,25 +491,42 @@ struct bpf_insn_aux_data { #define BPF_VERIFIER_TMP_LOG_SIZE 1024 struct bpf_verifier_log { - u32 level; - char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; + /* Logical start and end positions of a "log window" of the verifier log. + * start_pos == 0 means we haven't truncated anything. + * Once truncation starts to happen, start_pos + len_total == end_pos, + * except during log reset situations, in which (end_pos - start_pos) + * might get smaller than len_total (see bpf_vlog_reset()). + * Generally, (end_pos - start_pos) gives number of useful data in + * user log buffer.
+ */ + u64 start_pos; + u64 end_pos; char __user *ubuf; - u32 len_used; + u32 level; u32 len_total; + char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 +#define BPF_LOG_FIXED 8 #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) -#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) +#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED) #define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U +static inline u32 bpf_log_used(const struct bpf_verifier_log *log) +{ + return log->end_pos - log->start_pos; +} + static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) { - return log->len_used >= log->len_total - 1; + if (log->level & BPF_LOG_FIXED) + return bpf_log_used(log) >= log->len_total - 1; + return false; } static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) @@ -596,7 +613,7 @@ struct bpf_verifier_env { u32 scratched_regs; /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; - u32 prev_log_len, prev_insn_print_len; + u64 prev_log_pos, prev_insn_print_pos; /* buffer used in reg_type_str() to generate reg_type string */ char type_str_buf[TYPE_STR_BUF_LEN]; }; @@ -608,7 +625,9 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); -void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos); +void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); +void bpf_vlog_finalize(struct bpf_verifier_log *log); +bool bpf_vlog_truncated(const struct bpf_verifier_log *log); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3 From fa1c7d5cc404ac3b6e6b4ab6d00b07c76bd819be Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:57 -0700 Subject: bpf: Keep track of total log content size in both fixed and rolling modes Change how we do accounting in BPF_LOG_FIXED mode and adopt log->end_pos as the *logical* log position. This means that we can now go beyond the physical log buffer size and be able to tell what the log buffer size should be to fit the entire log contents without -ENOSPC. To do this for BPF_LOG_FIXED mode, we need to remove the short-circuiting logic of not vsnprintf()'ing further log content once we have filled up the user-provided buffer, which is done by the bpf_verifier_log_needed() checks. We modify these checks to always keep going if log->level is non-zero (i.e., a log is requested), even if log->ubuf was NULL'ed out due to copying data to user-space, or if the entire log buffer is physically full. We adapt the bpf_verifier_vlog() routine to work correctly with the log->ubuf == NULL condition, performing log formatting into a temporary kernel buffer and doing all the necessary accounting, but just avoiding copying data out if the buffer is full or NULL'ed out. With these changes, it's now possible to do this sort of determination of log content size in both BPF_LOG_FIXED and the default rolling log mode. We need to keep in mind bpf_vlog_reset(), though, which shrinks the log contents after successful verification of a particular code path. This log reset means that log->end_pos isn't always increasing, so to return back to users what the log buffer size should be to fit all log content without causing -ENOSPC even in the presence of log resetting, we need to keep the maximum over the "lifetime" of logging.
We do this accounting in the bpf_vlog_update_len_max() helper. A related and subtle aspect is that, with this logical log->end_pos, even in BPF_LOG_FIXED mode we could temporarily "overflow" the buffer, but then reset it back with bpf_vlog_reset() to a position inside the user-supplied log_buf. In such a situation we still want to properly maintain the terminating zero. We will eventually return -ENOSPC even if the final log buffer is too small (we detect this through the log->len_max check). This behavior is simpler to reason about and is consistent with the current behavior of the verifier log. Handling this required a small addition to the bpf_vlog_reset() logic to avoid doing put_user() beyond the physical log buffer dimensions. Another issue to keep in mind is that we limit the log buffer size to a 32-bit value and keep the log length as a u32, but theoretically the verifier could produce a huge log stretching beyond 4GB. Instead of keeping (and later returning) a 64-bit log length, we cap it at UINT_MAX. The current UAPI makes it impossible to specify a log buffer size bigger than 4GB anyway, so we don't really lose anything here and keep everything consistently 32-bit in the UAPI. This property will be utilized in the next patch. Doing the same determination of the maximum log buffer for rolling mode is trivial, as log->end_pos and log->start_pos are already logical positions, so there is nothing new there. These changes incidentally fix one small issue with the previous logging logic. Previously, if the user provided a log buffer of size N and the actual log output was exactly N-1 bytes + terminating \0, kernel logic couldn't distinguish this condition from the log truncation scenario, which would end up with truncated log contents of N-1 bytes + terminating \0 as well. But now, with log->end_pos being a logical position that can go beyond the actual log buffer size, we can distinguish these two conditions, which we do in this patch. This plays nicely with returning log_size_actual (implemented in the UAPI in the next patch), as we can now guarantee that if the user takes such log_size_actual and provides a log buffer of that exact size, they will not get -ENOSPC in return. All in all, these changes conceptually unify the fixed and rolling log modes much better, and enable a nice feature requested by users: knowing what the size of the buffer should be to avoid -ENOSPC. We'll plumb this through the UAPI and the code in the next patch.
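A condensed sketch of the accounting described above (illustrative only; the in-kernel helpers in kernel/bpf/log.c differ in details such as the put_user() handling):

    static void bpf_vlog_account_sketch(struct bpf_verifier_log *log, u32 n)
    {
        u64 len;

        log->end_pos += n;
        /* rotating mode: drag start_pos along once the logical window
         * outgrows the physical buffer, so that
         * end_pos - start_pos == len_total */
        if (!(log->level & BPF_LOG_FIXED) &&
            log->end_pos - log->start_pos > log->len_total)
            log->start_pos = log->end_pos - log->len_total;

        /* len_max keeps the high-water mark across bpf_vlog_reset(),
         * capped at UINT_MAX to stay 32-bit in the UAPI */
        len = log->end_pos < UINT_MAX ? log->end_pos : UINT_MAX;
        if (log->len_max < (u32)len)
            log->len_max = (u32)len;

        /* the physical write offset in the user buffer simply wraps:
         * pos % log->len_total */
    }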
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-12-andrii@kernel.org --- include/linux/bpf_verifier.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4c926227f612..98d2eb382dbb 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -504,6 +504,7 @@ struct bpf_verifier_log { char __user *ubuf; u32 level; u32 len_total; + u32 len_max; char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; @@ -517,23 +518,16 @@ struct bpf_verifier_log { #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U -static inline u32 bpf_log_used(const struct bpf_verifier_log *log) -{ - return log->end_pos - log->start_pos; -} - static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) { if (log->level & BPF_LOG_FIXED) - return bpf_log_used(log) >= log->len_total - 1; + return log->end_pos >= log->len_total; return false; } static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { - return log && - ((log->level && log->ubuf && !bpf_verifier_log_full(log)) || - log->level == BPF_LOG_KERNEL); + return log && log->level; } #define BPF_MAX_SUBPROGS 256 -- cgit v1.2.3 From 47a71c1f9af0a334c9dfa97633c41de4feda4287 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:58 -0700 Subject: bpf: Add log_true_size output field to return necessary log buffer size Add output-only log_true_size and btf_log_true_size fields to the BPF_PROG_LOAD and BPF_BTF_LOAD commands, respectively. They return the size of the log buffer necessary to fit all the log contents at the specified log_level. This is very useful for BPF loader libraries like libbpf to be able to size the log buffer correctly, but it could be used by users directly as well, if necessary. This patch plumbs all this through the code, taking into account the actual bpf_attr size provided by user space to determine whether these new fields are expected by users. If they are, they are set by the kernel on return. We refactor the btf_parse() function to accommodate this, moving attr and uattr handling inside it. The rest is very straightforward code, which is split from the logging accounting changes in the previous patch to make it simpler to review the logic vs the UAPI changes.
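A hypothetical user-space flow using the new field (a fragment sketch: program setup and full error handling elided, and libbpf normally does this for you):

    char small_buf[4096];
    union bpf_attr attr = {};

    /* ... fill in prog_type, insns, insn_cnt, license ... */
    attr.log_level = 2;
    attr.log_buf = (unsigned long)small_buf;
    attr.log_size = sizeof(small_buf);

    int fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    if (fd < 0 && errno == ENOSPC) {
        /* the kernel reported the exact size needed at this log_level */
        char *big_buf = malloc(attr.log_true_size);

        attr.log_buf = (unsigned long)big_buf;
        attr.log_size = attr.log_true_size;
        fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    }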
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-13-andrii@kernel.org --- include/linux/bpf.h | 2 +- include/linux/btf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 002a811b6b90..2c6095bd7d69 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2175,7 +2175,7 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); diff --git a/include/linux/btf.h b/include/linux/btf.h index d53b10cc55f2..495250162422 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -125,7 +125,7 @@ extern const struct file_operations btf_fops; void btf_get(struct btf *btf); void btf_put(struct btf *btf); -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr); +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz); struct btf *btf_get_by_fd(int fd); int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, -- cgit v1.2.3 From bdcab4144f5da97cc0fa7e1dd63b8475e10c8f0a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:59 -0700 Subject: bpf: Simplify internal verifier log interface Simplify the internal verifier log API down to bpf_vlog_init() and bpf_vlog_finalize(). The former handles input argument validation in one place and makes it easier to change. The latter subsumes the -ENOSPC (truncation) and -EFAULT handling and simplifies both callers' code (bpf_check() and btf_parse()). For btf_parse(), this patch also makes sure that verifier log finalization happens even if there is some error condition during the BTF verification process prior to the normal finalization step.
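The resulting caller pattern can be sketched as follows (a simplified sketch with error handling compressed; the real bpf_check()/btf_parse() code differs in detail):

    u32 log_true_size;
    int err;

    err = bpf_vlog_init(&env->log, attr->log_level,
                        u64_to_user_ptr(attr->log_buf), attr->log_size);
    if (err)
        return err;

    /* ... run verification, appending to the log via bpf_log() ... */

    err = bpf_vlog_finalize(&env->log, &log_true_size);
    /* finalize rotates/zero-terminates the user buffer and returns
     * -ENOSPC on truncation or -EFAULT on copy failures;
     * log_true_size reports the buffer size needed to fit it all */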
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-14-andrii@kernel.org --- include/linux/bpf_verifier.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 98d2eb382dbb..f03852b89d28 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -518,13 +518,6 @@ struct bpf_verifier_log { #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U -static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) -{ - if (log->level & BPF_LOG_FIXED) - return log->end_pos >= log->len_total; - return false; -} - static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && log->level; @@ -612,16 +605,16 @@ struct bpf_verifier_env { char type_str_buf[TYPE_STR_BUF_LEN]; }; -bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log); __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); +int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, + char __user *log_buf, u32 log_size); void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); -void bpf_vlog_finalize(struct bpf_verifier_log *log); -bool bpf_vlog_truncated(const struct bpf_verifier_log *log); +int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3