summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-04-14 18:04:04 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-04-14 18:04:04 -0700
commitf5ad4101009e7f5f5984ffea6923d4fcd470932a (patch)
treec9c25bb09794d372c3028113d1f89f2a2cbcbca4 /kernel
parente997ac58ad0b47141c62c79cde8356fe5633287a (diff)
parent71b500afd2f7336f5b6c6026f2af546fc079be26 (diff)
Merge tag 'bpf-next-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf updates from Alexei Starovoitov: - Welcome new BPF maintainers: Kumar Kartikeya Dwivedi, Eduard Zingerman while Martin KaFai Lau reduced his load to Reviwer. - Lots of fixes everywhere from many first time contributors. Thank you All. - Diff stat is dominated by mechanical split of verifier.c into multiple components: - backtrack.c: backtracking logic and jump history - states.c: state equivalence - cfg.c: control flow graph, postorder, strongly connected components - liveness.c: register and stack liveness - fixups.c: post-verification passes: instruction patching, dead code removal, bpf_loop inlining, finalize fastcall 8k line were moved. verifier.c still stands at 20k lines. Further refactoring is planned for the next release. - Replace dynamic stack liveness with static stack liveness based on data flow analysis. This improved the verification time by 2x for some programs and equally reduced memory consumption. New logic is in liveness.c and supported by constant folding in const_fold.c (Eduard Zingerman, Alexei Starovoitov) - Introduce BTF layout to ease addition of new BTF kinds (Alan Maguire) - Use kmalloc_nolock() universally in BPF local storage (Amery Hung) - Fix several bugs in linked registers delta tracking (Daniel Borkmann) - Improve verifier support of arena pointers (Emil Tsalapatis) - Improve verifier tracking of register bounds in min/max and tnum domains (Harishankar Vishwanathan, Paul Chaignon, Hao Sun) - Further extend support for implicit arguments in the verifier (Ihor Solodrai) - Add support for nop,nop5 instruction combo for USDT probes in libbpf (Jiri Olsa) - Support merging multiple module BTFs (Josef Bacik) - Extend applicability of bpf_kptr_xchg (Kaitao Cheng) - Retire rcu_trace_implies_rcu_gp() (Kumar Kartikeya Dwivedi) - Support variable offset context access for 'syscall' programs (Kumar Kartikeya Dwivedi) - Migrate bpf_task_work and dynptr to kmalloc_nolock() (Mykyta Yatsenko) - Fix UAF in in open-coded task_vma iterator (Puranjay Mohan) * tag 'bpf-next-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (241 commits) selftests/bpf: cover short IPv4/IPv6 inputs with adjust_room bpf: reject short IPv4/IPv6 inputs in bpf_prog_test_run_skb selftests/bpf: Use memfd_create instead of shm_open in cgroup_iter_memcg selftests/bpf: Add test for cgroup storage OOB read bpf: Fix OOB in pcpu_init_value selftests/bpf: Fix reg_bounds to match new tnum-based refinement selftests/bpf: Add tests for non-arena/arena operations bpf: Allow instructions with arena source and non-arena dest registers bpftool: add missing fsession to the usage and docs of bpftool docs/bpf: add missing fsession attach type to docs bpf: add missing fsession to the verifier log bpf: Move BTF checking logic into check_btf.c bpf: Move backtracking logic to backtrack.c bpf: Move state equivalence logic to states.c bpf: Move check_cfg() into cfg.c bpf: Move compute_insn_live_regs() into liveness.c bpf: Move fixup/post-processing logic from verifier.c into fixups.c bpf: Simplify do_check_insn() bpf: Move checks for reserved fields out of the main pass bpf: Delete unused variable ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arraymap.c4
-rw-r--r--kernel/bpf/backtrack.c934
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c11
-rw-r--r--kernel/bpf/bpf_inode_storage.c11
-rw-r--r--kernel/bpf/bpf_local_storage.c157
-rw-r--r--kernel/bpf/bpf_lsm.c7
-rw-r--r--kernel/bpf/bpf_task_storage.c11
-rw-r--r--kernel/bpf/btf.c86
-rw-r--r--kernel/bpf/cfg.c872
-rw-r--r--kernel/bpf/check_btf.c463
-rw-r--r--kernel/bpf/const_fold.c396
-rw-r--r--kernel/bpf/core.c95
-rw-r--r--kernel/bpf/cpumap.c5
-rw-r--r--kernel/bpf/devmap.c5
-rw-r--r--kernel/bpf/fixups.c2457
-rw-r--r--kernel/bpf/hashtab.c6
-rw-r--r--kernel/bpf/helpers.c243
-rw-r--r--kernel/bpf/liveness.c2400
-rw-r--r--kernel/bpf/local_storage.c2
-rw-r--r--kernel/bpf/log.c62
-rw-r--r--kernel/bpf/memalloc.c33
-rw-r--r--kernel/bpf/offload.c10
-rw-r--r--kernel/bpf/states.c1563
-rw-r--r--kernel/bpf/syscall.c44
-rw-r--r--kernel/bpf/task_iter.c151
-rw-r--r--kernel/bpf/tnum.c46
-rw-r--r--kernel/bpf/verifier.c10000
-rw-r--r--kernel/trace/trace_kprobe.c8
29 files changed, 11205 insertions, 8880 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 79cf22860a99..399007b67a92 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,11 +6,12 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o const_fold.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
+obj-$(CONFIG_BPF_SYSCALL) += fixups.o cfg.o states.o backtrack.o check_btf.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 33de68c95ad8..5e25e0353509 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1015,8 +1015,10 @@ static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- for (i = 0; i < array->map.max_entries; i++)
+ for (i = 0; i < array->map.max_entries; i++) {
__fd_array_map_delete_elem(map, &i, need_defer);
+ cond_resched();
+ }
}
static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c
new file mode 100644
index 000000000000..854731dc93fe
--- /dev/null
+++ b/kernel/bpf/backtrack.c
@@ -0,0 +1,934 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/filter.h>
+#include <linux/bitmap.h>
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+/* for any branch, call, exit record the history of jmps in the given state */
+int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags, u64 linked_regs)
+{
+ u32 cnt = cur->jmp_history_cnt;
+ struct bpf_jmp_history_entry *p;
+ size_t alloc_size;
+
+ /* combine instruction flags if we already recorded this instruction */
+ if (env->cur_hist_ent) {
+ /* atomic instructions push insn_flags twice, for READ and
+ * WRITE sides, but they should agree on stack slot
+ */
+ verifier_bug_if((env->cur_hist_ent->flags & insn_flags) &&
+ (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+ env, "insn history: insn_idx %d cur flags %x new flags %x",
+ env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+ env->cur_hist_ent->flags |= insn_flags;
+ verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
+ "insn history: insn_idx %d linked_regs: %#llx",
+ env->insn_idx, env->cur_hist_ent->linked_regs);
+ env->cur_hist_ent->linked_regs = linked_regs;
+ return 0;
+ }
+
+ cnt++;
+ alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+ p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT);
+ if (!p)
+ return -ENOMEM;
+ cur->jmp_history = p;
+
+ p = &cur->jmp_history[cnt - 1];
+ p->idx = env->insn_idx;
+ p->prev_idx = env->prev_insn_idx;
+ p->flags = insn_flags;
+ p->linked_regs = linked_regs;
+ cur->jmp_history_cnt = cnt;
+ env->cur_hist_ent = p;
+
+ return 0;
+}
+
+static bool is_atomic_load_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_LOAD_ACQ;
+}
+
+static bool is_atomic_fetch_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm & BPF_FETCH);
+}
+
+static int insn_stack_access_spi(int insn_flags)
+{
+ return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
+}
+
+static int insn_stack_access_frameno(int insn_flags)
+{
+ return insn_flags & INSN_F_FRAMENO_MASK;
+}
+
+/* Backtrack one insn at a time. If idx is not at the top of recorded
+ * history then previous instruction came from straight line execution.
+ * Return -ENOENT if we exhausted all instructions within given state.
+ *
+ * It's legal to have a bit of a looping with the same starting and ending
+ * insn index within the same state, e.g.: 3->4->5->3, so just because current
+ * instruction index is the same as state's first_idx doesn't mean we are
+ * done. If there is still some jump history left, we should keep going. We
+ * need to take into account that we might have a jump history between given
+ * state's parent and itself, due to checkpointing. In this case, we'll have
+ * history entry recording a jump from last instruction of parent state and
+ * first instruction of given state.
+ */
+static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
+ u32 *history)
+{
+ u32 cnt = *history;
+
+ if (i == st->first_insn_idx) {
+ if (cnt == 0)
+ return -ENOENT;
+ if (cnt == 1 && st->jmp_history[0].idx == i)
+ return -ENOENT;
+ }
+
+ if (cnt && st->jmp_history[cnt - 1].idx == i) {
+ i = st->jmp_history[cnt - 1].prev_idx;
+ (*history)--;
+ } else {
+ i--;
+ }
+ return i;
+}
+
+static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
+ u32 hist_end, int insn_idx)
+{
+ if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
+ return &st->jmp_history[hist_end - 1];
+ return NULL;
+}
+
+static inline void bt_init(struct backtrack_state *bt, u32 frame)
+{
+ bt->frame = frame;
+}
+
+static inline void bt_reset(struct backtrack_state *bt)
+{
+ struct bpf_verifier_env *env = bt->env;
+
+ memset(bt, 0, sizeof(*bt));
+ bt->env = env;
+}
+
+static inline u32 bt_empty(struct backtrack_state *bt)
+{
+ u64 mask = 0;
+ int i;
+
+ for (i = 0; i <= bt->frame; i++)
+ mask |= bt->reg_masks[i] | bt->stack_masks[i];
+
+ return mask == 0;
+}
+
+static inline int bt_subprog_enter(struct backtrack_state *bt)
+{
+ if (bt->frame == MAX_CALL_FRAMES - 1) {
+ verifier_bug(bt->env, "subprog enter from frame %d", bt->frame);
+ return -EFAULT;
+ }
+ bt->frame++;
+ return 0;
+}
+
+static inline int bt_subprog_exit(struct backtrack_state *bt)
+{
+ if (bt->frame == 0) {
+ verifier_bug(bt->env, "subprog exit from frame 0");
+ return -EFAULT;
+ }
+ bt->frame--;
+ return 0;
+}
+
+static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] &= ~(1 << reg);
+}
+
+static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
+{
+ bpf_bt_set_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_clear_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] &= ~(1ull << slot);
+}
+
+static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->reg_masks[frame];
+}
+
+static inline u32 bt_reg_mask(struct backtrack_state *bt)
+{
+ return bt->reg_masks[bt->frame];
+}
+
+static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->stack_masks[frame];
+}
+
+static inline u64 bt_stack_mask(struct backtrack_state *bt)
+{
+ return bt->stack_masks[bt->frame];
+}
+
+static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
+{
+ return bt->reg_masks[bt->frame] & (1 << reg);
+}
+
+
+/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
+static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
+void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, stack_mask);
+ for_each_set_bit(i, mask, 64) {
+ n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+
+
+/* For given verifier state backtrack_insn() is called from the last insn to
+ * the first insn. Its purpose is to compute a bitmask of registers and
+ * stack slots that needs precision in the parent verifier state.
+ *
+ * @idx is an index of the instruction we are currently processing;
+ * @subseq_idx is an index of the subsequent instruction that:
+ * - *would be* executed next, if jump history is viewed in forward order;
+ * - *was* processed previously during backtracking.
+ */
+static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
+ struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
+{
+ struct bpf_insn *insn = env->prog->insnsi + idx;
+ u8 class = BPF_CLASS(insn->code);
+ u8 opcode = BPF_OP(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u32 dreg = insn->dst_reg;
+ u32 sreg = insn->src_reg;
+ u32 spi, i, fr;
+
+ if (insn->code == 0)
+ return 0;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
+ verbose(env, "mark_precise: frame%d: regs=%s ",
+ bt->frame, env->tmp_str_buf);
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
+ verbose(env, "stack=%s before ", env->tmp_str_buf);
+ verbose(env, "%d: ", idx);
+ bpf_verbose_insn(env, insn);
+ }
+
+ /* If there is a history record that some registers gained range at this insn,
+ * propagate precision marks to those registers, so that bt_is_reg_set()
+ * accounts for these registers.
+ */
+ bpf_bt_sync_linked_regs(bt, hist);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (!bt_is_reg_set(bt, dreg))
+ return 0;
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ /* sreg is reserved and unused
+ * dreg still need precision before this insn
+ */
+ return 0;
+ } else if (opcode == BPF_MOV) {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* dreg = sreg or dreg = (s8, s16, s32)sreg
+ * dreg needs precision after this insn
+ * sreg needs precision before this insn
+ */
+ bt_clear_reg(bt, dreg);
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
+ } else {
+ /* dreg = K
+ * dreg needs precision after this insn.
+ * Corresponding register is already marked
+ * as precise=true in this verifier state.
+ * No further markings in parent are necessary
+ */
+ bt_clear_reg(bt, dreg);
+ }
+ } else {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* dreg += sreg
+ * both dreg and sreg need precision
+ * before this insn
+ */
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
+ } /* else dreg += K
+ * dreg still needs precision before this insn
+ */
+ }
+ } else if (class == BPF_LDX ||
+ is_atomic_load_insn(insn) ||
+ is_atomic_fetch_insn(insn)) {
+ u32 load_reg = dreg;
+
+ /*
+ * Atomic fetch operation writes the old value into
+ * a register (sreg or r0) and if it was tracked for
+ * precision, propagate to the stack slot like we do
+ * in regular ldx.
+ */
+ if (is_atomic_fetch_insn(insn))
+ load_reg = insn->imm == BPF_CMPXCHG ?
+ BPF_REG_0 : sreg;
+
+ if (!bt_is_reg_set(bt, load_reg))
+ return 0;
+ bt_clear_reg(bt, load_reg);
+
+ /* scalars can only be spilled into stack w/o losing precision.
+ * Load from any other memory can be zero extended.
+ * The desire to keep that precision is already indicated
+ * by 'precise' mark in corresponding register of this state.
+ * No further tracking necessary.
+ */
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
+ return 0;
+ /* dreg = *(u64 *)[fp - off] was a fill from the stack.
+ * that [fp - off] slot contains scalar that needs to be
+ * tracked with precision
+ */
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ bpf_bt_set_frame_slot(bt, fr, spi);
+ } else if (class == BPF_STX || class == BPF_ST) {
+ if (bt_is_reg_set(bt, dreg))
+ /* stx & st shouldn't be using _scalar_ dst_reg
+ * to access memory. It means backtracking
+ * encountered a case of pointer subtraction.
+ */
+ return -ENOTSUPP;
+ /* scalars can only be spilled into stack */
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
+ return 0;
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ if (!bt_is_frame_slot_set(bt, fr, spi))
+ return 0;
+ bt_clear_frame_slot(bt, fr, spi);
+ if (class == BPF_STX)
+ bt_set_reg(bt, sreg);
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
+ if (bpf_pseudo_call(insn)) {
+ int subprog_insn_idx, subprog;
+
+ subprog_insn_idx = idx + insn->imm + 1;
+ subprog = bpf_find_subprog(env, subprog_insn_idx);
+ if (subprog < 0)
+ return -EFAULT;
+
+ if (bpf_subprog_is_global(env, subprog)) {
+ /* check that jump history doesn't have any
+ * extra instructions from subprog; the next
+ * instruction after call to global subprog
+ * should be literally next instruction in
+ * caller program
+ */
+ verifier_bug_if(idx + 1 != subseq_idx, env,
+ "extra insn from subprog");
+ /* r1-r5 are invalidated after subprog call,
+ * so for global func call it shouldn't be set
+ * anymore
+ */
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verifier_bug(env, "global subprog unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ /* global subprog always sets R0 */
+ bt_clear_reg(bt, BPF_REG_0);
+ return 0;
+ } else {
+ /* static subprog call instruction, which
+ * means that we are exiting current subprog,
+ * so only r1-r5 could be still requested as
+ * precise, r0 and r6-r10 or any stack slot in
+ * the current frame should be zero by now
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verifier_bug(env, "static subprog unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ /* we are now tracking register spills correctly,
+ * so any instance of leftover slots is a bug
+ */
+ if (bt_stack_mask(bt) != 0) {
+ verifier_bug(env,
+ "static subprog leftover stack slots %llx",
+ bt_stack_mask(bt));
+ return -EFAULT;
+ }
+ /* propagate r1-r5 to the caller */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+ if (bt_is_reg_set(bt, i)) {
+ bt_clear_reg(bt, i);
+ bpf_bt_set_frame_reg(bt, bt->frame - 1, i);
+ }
+ }
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ }
+ } else if (bpf_is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
+ /* exit from callback subprog to callback-calling helper or
+ * kfunc call. Use idx/subseq_idx check to discern it from
+ * straight line code backtracking.
+ * Unlike the subprog call handling above, we shouldn't
+ * propagate precision of r1-r5 (if any requested), as they are
+ * not actually arguments passed directly to callback subprogs
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verifier_bug(env, "callback unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ if (bt_stack_mask(bt) != 0) {
+ verifier_bug(env, "callback leftover stack slots %llx",
+ bt_stack_mask(bt));
+ return -EFAULT;
+ }
+ /* clear r1-r5 in callback subprog's mask */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ } else if (opcode == BPF_CALL) {
+ /* kfunc with imm==0 is invalid and fixup_kfunc_call will
+ * catch this error later. Make backtracking conservative
+ * with ENOTSUPP.
+ */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
+ return -ENOTSUPP;
+ /* regular helper call sets R0 */
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ /* if backtracking was looking for registers R1-R5
+ * they should have been found already.
+ */
+ verifier_bug(env, "backtracking call unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call
+ && subseq_idx - idx != 1) {
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+ }
+ } else if (opcode == BPF_EXIT) {
+ bool r0_precise;
+
+ /* Backtracking to a nested function call, 'idx' is a part of
+ * the inner frame 'subseq_idx' is a part of the outer frame.
+ * In case of a regular function call, instructions giving
+ * precision to registers R1-R5 should have been found already.
+ * In case of a callback, it is ok to have R1-R5 marked for
+ * backtracking, as these registers are set by the function
+ * invoking callback.
+ */
+ if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx))
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verifier_bug(env, "backtracking exit unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+
+ /* BPF_EXIT in subprog or callback always returns
+ * right after the call instruction, so by checking
+ * whether the instruction at subseq_idx-1 is subprog
+ * call or not we can distinguish actual exit from
+ * *subprog* from exit from *callback*. In the former
+ * case, we need to propagate r0 precision, if
+ * necessary. In the former we never do that.
+ */
+ r0_precise = subseq_idx - 1 >= 0 &&
+ bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
+ bt_is_reg_set(bt, BPF_REG_0);
+
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+
+ if (r0_precise)
+ bt_set_reg(bt, BPF_REG_0);
+ /* r6-r9 and stack slots will stay set in caller frame
+ * bitmasks until we return back from callee(s)
+ */
+ return 0;
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
+ return 0;
+ /* dreg <cond> sreg
+ * Both dreg and sreg need precision before
+ * this insn. If only sreg was marked precise
+ * before it would be equally necessary to
+ * propagate it to dreg.
+ */
+ if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK))
+ bt_set_reg(bt, sreg);
+ if (!hist || !(hist->flags & INSN_F_DST_REG_STACK))
+ bt_set_reg(bt, dreg);
+ } else if (BPF_SRC(insn->code) == BPF_K) {
+ /* dreg <cond> K
+ * Only dreg still needs precision before
+ * this insn, so for the K-based conditional
+ * there is nothing new to be marked.
+ */
+ }
+ } else if (class == BPF_LD) {
+ if (!bt_is_reg_set(bt, dreg))
+ return 0;
+ bt_clear_reg(bt, dreg);
+ /* It's ld_imm64 or ld_abs or ld_ind.
+ * For ld_imm64 no further tracking of precision
+ * into parent is necessary
+ */
+ if (mode == BPF_IND || mode == BPF_ABS)
+ /* to be analyzed */
+ return -ENOTSUPP;
+ }
+ /* Propagate precision marks to linked registers, to account for
+ * registers marked as precise in this function.
+ */
+ bpf_bt_sync_linked_regs(bt, hist);
+ return 0;
+}
+
+/* the scalar precision tracking algorithm:
+ * . at the start all registers have precise=false.
+ * . scalar ranges are tracked as normal through alu and jmp insns.
+ * . once precise value of the scalar register is used in:
+ * . ptr + scalar alu
+ * . if (scalar cond K|scalar)
+ * . helper_call(.., scalar, ...) where ARG_CONST is expected
+ * backtrack through the verifier states and mark all registers and
+ * stack slots with spilled constants that these scalar registers
+ * should be precise.
+ * . during state pruning two registers (or spilled stack slots)
+ * are equivalent if both are not precise.
+ *
+ * Note the verifier cannot simply walk register parentage chain,
+ * since many different registers and stack slots could have been
+ * used to compute single precise scalar.
+ *
+ * The approach of starting with precise=true for all registers and then
+ * backtrack to mark a register as not precise when the verifier detects
+ * that program doesn't care about specific value (e.g., when helper
+ * takes register as ARG_ANYTHING parameter) is not safe.
+ *
+ * It's ok to walk single parentage chain of the verifier states.
+ * It's possible that this backtracking will go all the way till 1st insn.
+ * All other branches will be explored for needing precision later.
+ *
+ * The backtracking needs to deal with cases like:
+ * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
+ * r9 -= r8
+ * r5 = r9
+ * if r5 > 0x79f goto pc+7
+ * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
+ * r5 += 1
+ * ...
+ * call bpf_perf_event_output#25
+ * where .arg5_type = ARG_CONST_SIZE_OR_ZERO
+ *
+ * and this case:
+ * r6 = 1
+ * call foo // uses callee's r6 inside to compute r0
+ * r0 += r6
+ * if r0 == 0 goto
+ *
+ * to track above reg_mask/stack_mask needs to be independent for each frame.
+ *
+ * Also if parent's curframe > frame where backtracking started,
+ * the verifier need to mark registers in both frames, otherwise callees
+ * may incorrectly prune callers. This is similar to
+ * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
+ *
+ * For now backtracking falls back into conservative marking.
+ */
+void bpf_mark_all_scalars_precise(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
+ st->curframe);
+ }
+
+ /* big hammer: mark all scalars precise in this path.
+ * pop_stack may still get !precise scalars.
+ * We also skip current state and go straight to first parent state,
+ * because precision markings in current non-checkpointed state are
+ * not needed. See why in the comment in __mark_chain_precision below.
+ */
+ for (st = st->parent; st; st = st->parent) {
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE || reg->precise)
+ continue;
+ reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
+ i, j);
+ }
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!bpf_is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE || reg->precise)
+ continue;
+ reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
+ i, -(j + 1) * 8);
+ }
+ }
+ }
+ }
+}
+
+/*
+ * bpf_mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
+ * SCALARS, as well as any other registers and slots that contribute to
+ * a tracked state of given registers/stack slots, depending on specific BPF
+ * assembly instructions (see backtrack_insns() for exact instruction handling
+ * logic). This backtracking relies on recorded jmp_history and is able to
+ * traverse entire chain of parent states. This process ends only when all the
+ * necessary registers/slots and their transitive dependencies are marked as
+ * precise.
+ *
+ * One important and subtle aspect is that precise marks *do not matter* in
+ * the currently verified state (current state). It is important to understand
+ * why this is the case.
+ *
+ * First, note that current state is the state that is not yet "checkpointed",
+ * i.e., it is not yet put into env->explored_states, and it has no children
+ * states as well. It's ephemeral, and can end up either a) being discarded if
+ * compatible explored state is found at some point or BPF_EXIT instruction is
+ * reached or b) checkpointed and put into env->explored_states, branching out
+ * into one or more children states.
+ *
+ * In the former case, precise markings in current state are completely
+ * ignored by state comparison code (see regsafe() for details). Only
+ * checkpointed ("old") state precise markings are important, and if old
+ * state's register/slot is precise, regsafe() assumes current state's
+ * register/slot as precise and checks value ranges exactly and precisely. If
+ * states turn out to be compatible, current state's necessary precise
+ * markings and any required parent states' precise markings are enforced
+ * after the fact with propagate_precision() logic, after the fact. But it's
+ * important to realize that in this case, even after marking current state
+ * registers/slots as precise, we immediately discard current state. So what
+ * actually matters is any of the precise markings propagated into current
+ * state's parent states, which are always checkpointed (due to b) case above).
+ * As such, for scenario a) it doesn't matter if current state has precise
+ * markings set or not.
+ *
+ * Now, for the scenario b), checkpointing and forking into child(ren)
+ * state(s). Note that before current state gets to checkpointing step, any
+ * processed instruction always assumes precise SCALAR register/slot
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
+ * verifier takes this opportunity enthusiastically. Similarly, when
+ * register's value is used to calculate offset or memory address, exact
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
+ * what we mentioned above about state comparison ignoring precise markings
+ * during state comparison, BPF verifier ignores and also assumes precise
+ * markings *at will* during instruction verification process. But as verifier
+ * assumes precision, it also propagates any precision dependencies across
+ * parent states, which are not yet finalized, so can be further restricted
+ * based on new knowledge gained from restrictions enforced by their children
+ * states. This is so that once those parent states are finalized, i.e., when
+ * they have no more active children state, state comparison logic in
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
+ * required for correctness.
+ *
+ * To build a bit more intuition, note also that once a state is checkpointed,
+ * the path we took to get to that state is not important. This is crucial
+ * property for state pruning. When state is checkpointed and finalized at
+ * some instruction index, it can be correctly and safely used to "short
+ * circuit" any *compatible* state that reaches exactly the same instruction
+ * index. I.e., if we jumped to that instruction from a completely different
+ * code path than original finalized state was derived from, it doesn't
+ * matter, current state can be discarded because from that instruction
+ * forward having a compatible state will ensure we will safely reach the
+ * exit. States describe preconditions for further exploration, but completely
+ * forget the history of how we got here.
+ *
+ * This also means that even if we needed precise SCALAR range to get to
+ * finalized state, but from that point forward *that same* SCALAR register is
+ * never used in a precise context (i.e., it's precise value is not needed for
+ * correctness), it's correct and safe to mark such register as "imprecise"
+ * (i.e., precise marking set to false). This is what we rely on when we do
+ * not set precise marking in current state. If no child state requires
+ * precision for any given SCALAR register, it's safe to dictate that it can
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
+ *
+ * Skipping precise marking setting in current state is a mild version of
+ * relying on the above observation. But we can utilize this property even
+ * more aggressively by proactively forgetting any precise marking in the
+ * current state (which we inherited from the parent state), right before we
+ * checkpoint it and branch off into new child state. This is done by
+ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
+ * finalized states which help in short circuiting more future states.
+ */
+int bpf_mark_chain_precision(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *starting_state,
+ int regno,
+ bool *changed)
+{
+ struct bpf_verifier_state *st = starting_state;
+ struct backtrack_state *bt = &env->bt;
+ int first_idx = st->first_insn_idx;
+ int last_idx = starting_state->insn_idx;
+ int subseq_idx = -1;
+ struct bpf_func_state *func;
+ bool tmp, skip_first = true;
+ struct bpf_reg_state *reg;
+ int i, fr, err;
+
+ if (!env->bpf_capable)
+ return 0;
+
+ changed = changed ?: &tmp;
+ /* set frame number from which we are starting to backtrack */
+ bt_init(bt, starting_state->curframe);
+
+ /* Do sanity checks against current state of register and/or stack
+ * slot, but don't set precise flag in current state, as precision
+ * tracking in the current state is unnecessary.
+ */
+ func = st->frame[bt->frame];
+ if (regno >= 0) {
+ reg = &func->regs[regno];
+ if (reg->type != SCALAR_VALUE) {
+ verifier_bug(env, "backtracking misuse");
+ return -EFAULT;
+ }
+ bt_set_reg(bt, regno);
+ }
+
+ if (bt_empty(bt))
+ return 0;
+
+ for (;;) {
+ DECLARE_BITMAP(mask, 64);
+ u32 history = st->jmp_history_cnt;
+ struct bpf_jmp_history_entry *hist;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
+ bt->frame, last_idx, first_idx, subseq_idx);
+ }
+
+ if (last_idx < 0) {
+ /* we are at the entry into subprog, which
+ * is expected for global funcs, but only if
+ * requested precise registers are R1-R5
+ * (which are global func's input arguments)
+ */
+ if (st->curframe == 0 &&
+ st->frame[0]->subprogno > 0 &&
+ st->frame[0]->callsite == BPF_MAIN_FUNC &&
+ bt_stack_mask(bt) == 0 &&
+ (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
+ bitmap_from_u64(mask, bt_reg_mask(bt));
+ for_each_set_bit(i, mask, 32) {
+ reg = &st->frame[0]->regs[i];
+ bt_clear_reg(bt, i);
+ if (reg->type == SCALAR_VALUE) {
+ reg->precise = true;
+ *changed = true;
+ }
+ }
+ return 0;
+ }
+
+ verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx",
+ st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
+ return -EFAULT;
+ }
+
+ for (i = last_idx;;) {
+ if (skip_first) {
+ err = 0;
+ skip_first = false;
+ } else {
+ hist = get_jmp_hist_entry(st, history, i);
+ err = backtrack_insn(env, i, subseq_idx, hist, bt);
+ }
+ if (err == -ENOTSUPP) {
+ bpf_mark_all_scalars_precise(env, starting_state);
+ bt_reset(bt);
+ return 0;
+ } else if (err) {
+ return err;
+ }
+ if (bt_empty(bt))
+ /* Found assignment(s) into tracked register in this state.
+ * Since this state is already marked, just return.
+ * Nothing to be tracked further in the parent state.
+ */
+ return 0;
+ subseq_idx = i;
+ i = get_prev_insn_idx(st, i, &history);
+ if (i == -ENOENT)
+ break;
+ if (i >= env->prog->len) {
+ /* This can happen if backtracking reached insn 0
+ * and there are still reg_mask or stack_mask
+ * to backtrack.
+ * It means the backtracking missed the spot where
+ * particular register was initialized with a constant.
+ */
+ verifier_bug(env, "backtracking idx %d", i);
+ return -EFAULT;
+ }
+ }
+ st = st->parent;
+ if (!st)
+ break;
+
+ for (fr = bt->frame; fr >= 0; fr--) {
+ func = st->frame[fr];
+ bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ bt_clear_frame_reg(bt, fr, i);
+ continue;
+ }
+ if (reg->precise) {
+ bt_clear_frame_reg(bt, fr, i);
+ } else {
+ reg->precise = true;
+ *changed = true;
+ }
+ }
+
+ bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+ for_each_set_bit(i, mask, 64) {
+ if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE,
+ env, "stack slot %d, total slots %d",
+ i, func->allocated_stack / BPF_REG_SIZE))
+ return -EFAULT;
+
+ if (!bpf_is_spilled_scalar_reg(&func->stack[i])) {
+ bt_clear_frame_slot(bt, fr, i);
+ continue;
+ }
+ reg = &func->stack[i].spilled_ptr;
+ if (reg->precise) {
+ bt_clear_frame_slot(bt, fr, i);
+ } else {
+ reg->precise = true;
+ *changed = true;
+ }
+ }
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_reg_mask(bt, fr));
+ verbose(env, "mark_precise: frame%d: parent state regs=%s ",
+ fr, env->tmp_str_buf);
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_stack_mask(bt, fr));
+ verbose(env, "stack=%s: ", env->tmp_str_buf);
+ print_verifier_state(env, st, fr, true);
+ }
+ }
+
+ if (bt_empty(bt))
+ return 0;
+
+ subseq_idx = first_idx;
+ last_idx = st->last_insn_idx;
+ first_idx = st->first_insn_idx;
+ }
+
+ /* if we still have requested precise regs or slots, we missed
+ * something (e.g., stack access through non-r10 register), so
+ * fallback to marking all precise
+ */
+ if (!bt_empty(bt)) {
+ bpf_mark_all_scalars_precise(env, starting_state);
+ bt_reset(bt);
+ }
+
+ return 0;
+}
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index c2a2ead1f466..c76e9b0fabba 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -76,7 +76,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
return PTR_ERR(cgroup);
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, map_flags, false, GFP_ATOMIC);
+ value, map_flags, false);
cgroup_put(cgroup);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -114,7 +114,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache);
}
static void cgroup_storage_map_free(struct bpf_map *map)
@@ -122,9 +122,8 @@ static void cgroup_storage_map_free(struct bpf_map *map)
bpf_local_storage_map_free(map, &cgroup_cache);
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
+ void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -143,7 +142,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, BPF_NOEXIST, false, gfp_flags);
+ value, BPF_NOEXIST, false);
out:
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e86734609f3d..0da8d923e39d 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -98,7 +98,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(file_inode(fd_file(f)),
(struct bpf_local_storage_map *)map,
- value, map_flags, false, GFP_ATOMIC);
+ value, map_flags, false);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -122,9 +122,8 @@ static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
return inode_storage_delete(file_inode(fd_file(f)), map);
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -150,7 +149,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, false, gfp_flags);
+ BPF_NOEXIST, false);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
@@ -179,7 +178,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &inode_cache, false);
+ return bpf_local_storage_map_alloc(attr, &inode_cache);
}
static void inode_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 9c96a4477f81..6fc6a4b672b5 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -68,25 +68,19 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool swap_uptrs, gfp_t gfp_flags)
+ void *value, bool swap_uptrs)
{
struct bpf_local_storage_elem *selem;
if (mem_charge(smap, owner, smap->elem_size))
return NULL;
- if (smap->use_kmalloc_nolock) {
- selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
- __GFP_ZERO, NUMA_NO_NODE);
- } else {
- selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- gfp_flags | __GFP_NOWARN);
- }
+ selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
+ __GFP_ZERO, NUMA_NO_NODE);
if (selem) {
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
atomic_set(&selem->state, 0);
- selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
if (value) {
/* No need to call check_and_init_map_value as memory is zero init */
@@ -102,8 +96,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
-/* rcu tasks trace callback for use_kmalloc_nolock == false */
-static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
@@ -115,47 +108,14 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
kfree(local_storage);
}
-/* Handle use_kmalloc_nolock == false */
-static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
- bool vanilla_rcu)
-{
- if (vanilla_rcu)
- kfree_rcu(local_storage, rcu);
- else
- call_rcu_tasks_trace(&local_storage->rcu,
- __bpf_local_storage_free_trace_rcu);
-}
-
-static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
-{
- struct bpf_local_storage *local_storage;
-
- local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- kfree_nolock(local_storage);
-}
-
-static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
-{
- /*
- * RCU Tasks Trace grace period implies RCU grace period, do
- * kfree() directly.
- */
- bpf_local_storage_free_rcu(rcu);
-}
-
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
bool reuse_now)
{
if (!local_storage)
return;
- if (!local_storage->use_kmalloc_nolock) {
- __bpf_local_storage_free(local_storage, reuse_now);
- return;
- }
-
if (reuse_now) {
- call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ kfree_rcu(local_storage, rcu);
return;
}
@@ -163,42 +123,7 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
bpf_local_storage_free_trace_rcu);
}
-/* rcu callback for use_kmalloc_nolock == false */
-static void __bpf_selem_free_rcu(struct rcu_head *rcu)
-{
- struct bpf_local_storage_elem *selem;
- struct bpf_local_storage_map *smap;
-
- selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
- /* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */
- smap = rcu_dereference_check(SDATA(selem)->smap, 1);
-
- if (smap)
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- kfree(selem);
-}
-
-/* rcu tasks trace callback for use_kmalloc_nolock == false */
-static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
-{
- /*
- * RCU Tasks Trace grace period implies RCU grace period, do
- * kfree() directly.
- */
- __bpf_selem_free_rcu(rcu);
-}
-
-/* Handle use_kmalloc_nolock == false */
-static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
- bool vanilla_rcu)
-{
- if (vanilla_rcu)
- call_rcu(&selem->rcu, __bpf_selem_free_rcu);
- else
- call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
-}
-
-static void bpf_selem_free_rcu(struct rcu_head *rcu)
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage_map *smap;
@@ -209,37 +134,24 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
if (smap)
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- kfree_nolock(selem);
-}
-
-static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
-{
/*
* RCU Tasks Trace grace period implies RCU grace period, do
* kfree() directly.
*/
- bpf_selem_free_rcu(rcu);
+ kfree(selem);
}
void bpf_selem_free(struct bpf_local_storage_elem *selem,
bool reuse_now)
{
- if (!selem->use_kmalloc_nolock) {
- /*
- * No uptr will be unpin even when reuse_now == false since uptr
- * is only supported in task local storage, where
- * smap->use_kmalloc_nolock == true.
- */
- __bpf_selem_free(selem, reuse_now);
- return;
- }
+ struct bpf_local_storage_map *smap;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, 1);
if (reuse_now) {
- /*
- * While it is okay to call bpf_obj_free_fields() that unpins uptr when
- * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
- */
- call_rcu(&selem->rcu, bpf_selem_free_rcu);
+ if (smap)
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ kfree_rcu(selem, rcu);
return;
}
@@ -393,6 +305,9 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
unsigned long flags;
int err;
+ if (in_nmi())
+ return -EOPNOTSUPP;
+
if (unlikely(!selem_linked_to_storage_lockless(selem)))
/* selem has already been unlinked from sk */
return 0;
@@ -494,6 +409,14 @@ static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
}
raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
}
+ /*
+ * Highly unlikely scenario: memory leak
+ *
+ * When destroy() fails to acqurire local_storage->lock and initializes
+ * selem->local_storage to NULL before any racing map_free() sees the same
+ * selem, no one will free the local storage.
+ */
+ WARN_ON_ONCE(err && !in_map_free);
if (!err || !in_map_free)
RCU_INIT_POINTER(selem->local_storage, NULL);
}
@@ -552,8 +475,7 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
int bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem,
- gfp_t gfp_flags)
+ struct bpf_local_storage_elem *first_selem)
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
@@ -565,12 +487,8 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- if (smap->use_kmalloc_nolock)
- storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
- __GFP_ZERO, NUMA_NO_NODE);
- else
- storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- gfp_flags | __GFP_NOWARN);
+ storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
+ __GFP_ZERO, NUMA_NO_NODE);
if (!storage) {
err = -ENOMEM;
goto uncharge;
@@ -580,7 +498,6 @@ int bpf_local_storage_alloc(void *owner,
raw_res_spin_lock_init(&storage->lock);
storage->owner = owner;
storage->mem_charge = sizeof(*storage);
- storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
refcount_set(&storage->owner_refcnt, 1);
bpf_selem_link_storage_nolock(storage, first_selem);
@@ -628,7 +545,7 @@ uncharge:
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
+ void *value, u64 map_flags, bool swap_uptrs)
{
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
@@ -645,9 +562,6 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
!btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
return ERR_PTR(-EINVAL);
- if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
- return ERR_PTR(-EINVAL);
-
local_storage = rcu_dereference_check(*owner_storage(smap, owner),
bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
@@ -656,11 +570,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
+ selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
if (!selem)
return ERR_PTR(-ENOMEM);
- err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
+ err = bpf_local_storage_alloc(owner, smap, selem);
if (err) {
bpf_selem_free(selem, true);
mem_uncharge(smap, owner, smap->elem_size);
@@ -690,7 +604,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
- alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
@@ -857,8 +771,7 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
- struct bpf_local_storage_cache *cache,
- bool use_kmalloc_nolock)
+ struct bpf_local_storage_cache *cache)
{
struct bpf_local_storage_map *smap;
unsigned int i;
@@ -890,12 +803,6 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
smap->elem_size = offsetof(struct bpf_local_storage_elem,
sdata.data[attr->value_size]);
- /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
- * preemptible context. Thus, enforce all storages to use
- * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
- */
- smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
-
smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
return &smap->map;
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 0c4a0c8e6f70..c5c925f00202 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -359,8 +359,6 @@ BTF_ID(func, bpf_lsm_sb_umount)
BTF_ID(func, bpf_lsm_settime)
#ifdef CONFIG_SECURITY_NETWORK
-BTF_ID(func, bpf_lsm_inet_conn_established)
-
BTF_ID(func, bpf_lsm_socket_accept)
BTF_ID(func, bpf_lsm_socket_bind)
BTF_ID(func, bpf_lsm_socket_connect)
@@ -381,8 +379,9 @@ BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
-BTF_ID(func, bpf_lsm_task_to_inode)
BTF_ID(func, bpf_lsm_userns_create)
+BTF_ID(func, bpf_lsm_bdev_alloc_security)
+BTF_ID(func, bpf_lsm_bdev_setintegrity)
BTF_SET_END(sleepable_lsm_hooks)
BTF_SET_START(untrusted_lsm_hooks)
@@ -395,6 +394,8 @@ BTF_ID(func, bpf_lsm_sk_alloc_security)
BTF_ID(func, bpf_lsm_sk_free_security)
#endif /* CONFIG_SECURITY_NETWORK */
BTF_ID(func, bpf_lsm_task_free)
+BTF_ID(func, bpf_lsm_bdev_alloc_security)
+BTF_ID(func, bpf_lsm_bdev_free_security)
BTF_SET_END(untrusted_lsm_hooks)
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 605506792b5b..4b342be29eac 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -118,7 +118,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags,
- true, GFP_ATOMIC);
+ true);
err = PTR_ERR_OR_ZERO(sdata);
out:
@@ -165,9 +165,8 @@ out:
return err;
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -184,7 +183,7 @@ BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, false, gfp_flags);
+ BPF_NOEXIST, false);
return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
@@ -212,7 +211,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &task_cache, true);
+ return bpf_local_storage_map_alloc(attr, &task_cache);
}
static void task_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 71f9143fe90f..a62d78581207 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -270,6 +270,7 @@ struct btf {
struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
struct btf_struct_metas *struct_meta_tab;
struct btf_struct_ops_tab *struct_ops_tab;
+ struct btf_layout *layout;
/* split BTF support */
struct btf *base_btf;
@@ -1707,6 +1708,11 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env,
__btf_verifier_log(log, "type_len: %u\n", hdr->type_len);
__btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
__btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
+ if (hdr->hdr_len >= sizeof(struct btf_header) &&
+ btf_data_size >= hdr->hdr_len) {
+ __btf_verifier_log(log, "layout_off: %u\n", hdr->layout_off);
+ __btf_verifier_log(log, "layout_len: %u\n", hdr->layout_len);
+ }
__btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size);
}
@@ -5526,7 +5532,8 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
start = btf->nohdr_data + hdr->str_off;
end = start + hdr->str_len;
- if (end != btf->data + btf->data_size) {
+ if (hdr->hdr_len < sizeof(struct btf_header) &&
+ end != btf->data + btf->data_size) {
btf_verifier_log(env, "String section is not at the end");
return -EINVAL;
}
@@ -5547,9 +5554,46 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
return 0;
}
+static int btf_parse_layout_sec(struct btf_verifier_env *env)
+{
+ const struct btf_header *hdr = &env->btf->hdr;
+ struct btf *btf = env->btf;
+ void *start, *end;
+
+ if (hdr->hdr_len < sizeof(struct btf_header) ||
+ hdr->layout_len == 0)
+ return 0;
+
+ /* Layout section must align to 4 bytes */
+ if (hdr->layout_off & (sizeof(u32) - 1)) {
+ btf_verifier_log(env, "Unaligned layout_off");
+ return -EINVAL;
+ }
+ start = btf->nohdr_data + hdr->layout_off;
+ end = start + hdr->layout_len;
+
+ if (hdr->layout_len < sizeof(struct btf_layout)) {
+ btf_verifier_log(env, "Layout section is too small");
+ return -EINVAL;
+ }
+ if (hdr->layout_len % sizeof(struct btf_layout) != 0) {
+ btf_verifier_log(env, "layout_len is not multiple of %zu",
+ sizeof(struct btf_layout));
+ return -EINVAL;
+ }
+ if (end > btf->data + btf->data_size) {
+ btf_verifier_log(env, "Layout section is too big");
+ return -EINVAL;
+ }
+ btf->layout = start;
+
+ return 0;
+}
+
static const size_t btf_sec_info_offset[] = {
offsetof(struct btf_header, type_off),
offsetof(struct btf_header, str_off),
+ offsetof(struct btf_header, layout_off)
};
static int btf_sec_info_cmp(const void *a, const void *b)
@@ -5565,24 +5609,28 @@ static int btf_check_sec_info(struct btf_verifier_env *env,
{
struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)];
u32 total, expected_total, i;
+ u32 nr_secs = ARRAY_SIZE(btf_sec_info_offset);
const struct btf_header *hdr;
const struct btf *btf;
btf = env->btf;
hdr = &btf->hdr;
+ if (hdr->hdr_len < sizeof(struct btf_header) || hdr->layout_len == 0)
+ nr_secs--;
+
/* Populate the secs from hdr */
- for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++)
+ for (i = 0; i < nr_secs; i++)
secs[i] = *(struct btf_sec_info *)((void *)hdr +
btf_sec_info_offset[i]);
- sort(secs, ARRAY_SIZE(btf_sec_info_offset),
+ sort(secs, nr_secs,
sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL);
/* Check for gaps and overlap among sections */
total = 0;
expected_total = btf_data_size - hdr->hdr_len;
- for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) {
+ for (i = 0; i < nr_secs; i++) {
if (expected_total < secs[i].off) {
btf_verifier_log(env, "Invalid section offset");
return -EINVAL;
@@ -5938,6 +5986,10 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
if (err)
goto errout;
+ err = btf_parse_layout_sec(env);
+ if (err)
+ goto errout;
+
err = btf_parse_type_sec(env);
if (err)
goto errout;
@@ -6517,13 +6569,6 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
return prog->aux->attach_btf;
}
-static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t)
-{
- /* skip modifiers */
- t = btf_type_skip_modifiers(btf, t->type, NULL);
- return btf_type_is_void(t) || btf_type_is_int(t);
-}
-
u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
int off)
{
@@ -6912,10 +6957,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
/*
- * If it's a pointer to void, it's the same as scalar from the verifier
- * safety POV. Either way, no futher pointer walking is allowed.
+ * If it's a single or multilevel pointer, except a pointer
+ * to a structure, it's the same as scalar from the verifier
+ * safety POV. Multilevel pointers to structures are treated as
+ * scalars. The verifier lacks the context to infer the size of
+ * their target memory regions. Either way, no further pointer
+ * walking is allowed.
*/
- if (is_void_or_int_ptr(btf, t))
+ if (!btf_type_is_struct_ptr(btf, t))
return true;
/* this is a pointer to another type */
@@ -7845,15 +7894,16 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
- /* check that function returns int, exception cb also requires this */
+ /* check that function is void or returns int, exception cb also requires this */
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
+ if (!btf_type_is_void(t) && !btf_type_is_int(t) && !btf_is_any_enum(t)) {
if (!is_global)
return -EINVAL;
bpf_log(log,
- "Global function %s() doesn't return scalar. Only those are supported.\n",
+ "Global function %s() return value not void or scalar. "
+ "Only those are supported.\n",
tname);
return -EINVAL;
}
@@ -9019,7 +9069,7 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc
if (!t || !btf_type_is_ptr(t))
return -EINVAL;
- if (IS_ENABLED(CONFIG_CFI_CLANG)) {
+ if (IS_ENABLED(CONFIG_CFI)) {
/* Ensure the destructor kfunc type matches btf_dtor_kfunc_t */
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_void(t))
diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
new file mode 100644
index 000000000000..998f42a8189a
--- /dev/null
+++ b/kernel/bpf/cfg.c
@@ -0,0 +1,872 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/filter.h>
+#include <linux/sort.h>
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+/* non-recursive DFS pseudo code
+ * 1 procedure DFS-iterative(G,v):
+ * 2 label v as discovered
+ * 3 let S be a stack
+ * 4 S.push(v)
+ * 5 while S is not empty
+ * 6 t <- S.peek()
+ * 7 if t is what we're looking for:
+ * 8 return t
+ * 9 for all edges e in G.adjacentEdges(t) do
+ * 10 if edge e is already labelled
+ * 11 continue with the next edge
+ * 12 w <- G.adjacentVertex(t,e)
+ * 13 if vertex w is not discovered and not explored
+ * 14 label e as tree-edge
+ * 15 label w as discovered
+ * 16 S.push(w)
+ * 17 continue at 5
+ * 18 else if vertex w is discovered
+ * 19 label e as back-edge
+ * 20 else
+ * 21 // vertex w is explored
+ * 22 label e as forward- or cross-edge
+ * 23 label t as explored
+ * 24 S.pop()
+ *
+ * convention:
+ * 0x10 - discovered
+ * 0x11 - discovered and fall-through edge labelled
+ * 0x12 - discovered and fall-through and branch edges labelled
+ * 0x20 - explored
+ */
+
+enum {
+ DISCOVERED = 0x10,
+ EXPLORED = 0x20,
+ FALLTHROUGH = 1,
+ BRANCH = 2,
+};
+
+
+static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = bpf_find_containing_subprog(env, off);
+ subprog->changes_pkt_data = true;
+}
+
+static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = bpf_find_containing_subprog(env, off);
+ subprog->might_sleep = true;
+}
+
+/* 't' is an index of a call-site.
+ * 'w' is a callee entry point.
+ * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
+ * Rely on DFS traversal order and absence of recursive calls to guarantee that
+ * callee's change_pkt_data marks would be correct at that moment.
+ */
+static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
+{
+ struct bpf_subprog_info *caller, *callee;
+
+ caller = bpf_find_containing_subprog(env, t);
+ callee = bpf_find_containing_subprog(env, w);
+ caller->changes_pkt_data |= callee->changes_pkt_data;
+ caller->might_sleep |= callee->might_sleep;
+}
+
+enum {
+ DONE_EXPLORING = 0,
+ KEEP_EXPLORING = 1,
+};
+
+/* t, w, e - match pseudo-code above:
+ * t - index of current instruction
+ * w - next instruction
+ * e - edge
+ */
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
+{
+ int *insn_stack = env->cfg.insn_stack;
+ int *insn_state = env->cfg.insn_state;
+
+ if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
+ return DONE_EXPLORING;
+
+ if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
+ return DONE_EXPLORING;
+
+ if (w < 0 || w >= env->prog->len) {
+ verbose_linfo(env, t, "%d: ", t);
+ verbose(env, "jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ if (e == BRANCH) {
+ /* mark branch target for state pruning */
+ mark_prune_point(env, w);
+ mark_jmp_point(env, w);
+ }
+
+ if (insn_state[w] == 0) {
+ /* tree-edge */
+ insn_state[t] = DISCOVERED | e;
+ insn_state[w] = DISCOVERED;
+ if (env->cfg.cur_stack >= env->prog->len)
+ return -E2BIG;
+ insn_stack[env->cfg.cur_stack++] = w;
+ return KEEP_EXPLORING;
+ } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+ if (env->bpf_capable)
+ return DONE_EXPLORING;
+ verbose_linfo(env, t, "%d: ", t);
+ verbose_linfo(env, w, "%d: ", w);
+ verbose(env, "back-edge from insn %d to %d\n", t, w);
+ return -EINVAL;
+ } else if (insn_state[w] == EXPLORED) {
+ /* forward- or cross-edge */
+ insn_state[t] = DISCOVERED | e;
+ } else {
+ verifier_bug(env, "insn state internal bug");
+ return -EFAULT;
+ }
+ return DONE_EXPLORING;
+}
+
+static int visit_func_call_insn(int t, struct bpf_insn *insns,
+ struct bpf_verifier_env *env,
+ bool visit_callee)
+{
+ int ret, insn_sz;
+ int w;
+
+ insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
+ ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ mark_prune_point(env, t + insn_sz);
+ /* when we exit from subprog, we need to record non-linear history */
+ mark_jmp_point(env, t + insn_sz);
+
+ if (visit_callee) {
+ w = t + insns[t].imm + 1;
+ mark_prune_point(env, t);
+ merge_callee_effects(env, t, w);
+ ret = push_insn(t, w, BRANCH, env);
+ }
+ return ret;
+}
+
+struct bpf_iarray *bpf_iarray_realloc(struct bpf_iarray *old, size_t n_elem)
+{
+ size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]);
+ struct bpf_iarray *new;
+
+ new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT);
+ if (!new) {
+ /* this is what callers always want, so simplify the call site */
+ kvfree(old);
+ return NULL;
+ }
+
+ new->cnt = n_elem;
+ return new;
+}
+
+static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items)
+{
+ struct bpf_insn_array_value *value;
+ u32 i;
+
+ for (i = start; i <= end; i++) {
+ value = map->ops->map_lookup_elem(map, &i);
+ /*
+ * map_lookup_elem of an array map will never return an error,
+ * but not checking it makes some static analysers to worry
+ */
+ if (IS_ERR(value))
+ return PTR_ERR(value);
+ else if (!value)
+ return -EINVAL;
+ items[i - start] = value->xlated_off;
+ }
+ return 0;
+}
+
+static int cmp_ptr_to_u32(const void *a, const void *b)
+{
+ return *(u32 *)a - *(u32 *)b;
+}
+
+static int sort_insn_array_uniq(u32 *items, int cnt)
+{
+ int unique = 1;
+ int i;
+
+ sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL);
+
+ for (i = 1; i < cnt; i++)
+ if (items[i] != items[unique - 1])
+ items[unique++] = items[i];
+
+ return unique;
+}
+
+/*
+ * sort_unique({map[start], ..., map[end]}) into off
+ */
+int bpf_copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off)
+{
+ u32 n = end - start + 1;
+ int err;
+
+ err = copy_insn_array(map, start, end, off);
+ if (err)
+ return err;
+
+ return sort_insn_array_uniq(off, n);
+}
+
+/*
+ * Copy all unique offsets from the map
+ */
+static struct bpf_iarray *jt_from_map(struct bpf_map *map)
+{
+ struct bpf_iarray *jt;
+ int err;
+ int n;
+
+ jt = bpf_iarray_realloc(NULL, map->max_entries);
+ if (!jt)
+ return ERR_PTR(-ENOMEM);
+
+ n = bpf_copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items);
+ if (n < 0) {
+ err = n;
+ goto err_free;
+ }
+ if (n == 0) {
+ err = -EINVAL;
+ goto err_free;
+ }
+ jt->cnt = n;
+ return jt;
+
+err_free:
+ kvfree(jt);
+ return ERR_PTR(err);
+}
+
+/*
+ * Find and collect all maps which fit in the subprog. Return the result as one
+ * combined jump table in jt->items (allocated with kvcalloc)
+ */
+static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env,
+ int subprog_start, int subprog_end)
+{
+ struct bpf_iarray *jt = NULL;
+ struct bpf_map *map;
+ struct bpf_iarray *jt_cur;
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++) {
+ /*
+ * TODO (when needed): collect only jump tables, not static keys
+ * or maps for indirect calls
+ */
+ map = env->insn_array_maps[i];
+
+ jt_cur = jt_from_map(map);
+ if (IS_ERR(jt_cur)) {
+ kvfree(jt);
+ return jt_cur;
+ }
+
+ /*
+ * This is enough to check one element. The full table is
+ * checked to fit inside the subprog later in create_jt()
+ */
+ if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) {
+ u32 old_cnt = jt ? jt->cnt : 0;
+ jt = bpf_iarray_realloc(jt, old_cnt + jt_cur->cnt);
+ if (!jt) {
+ kvfree(jt_cur);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2);
+ }
+
+ kvfree(jt_cur);
+ }
+
+ if (!jt) {
+ verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start);
+ return ERR_PTR(-EINVAL);
+ }
+
+ jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt);
+ return jt;
+}
+
+static struct bpf_iarray *
+create_jt(int t, struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog;
+ int subprog_start, subprog_end;
+ struct bpf_iarray *jt;
+ int i;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ subprog_start = subprog->start;
+ subprog_end = (subprog + 1)->start;
+ jt = jt_from_subprog(env, subprog_start, subprog_end);
+ if (IS_ERR(jt))
+ return jt;
+
+ /* Check that the every element of the jump table fits within the given subprogram */
+ for (i = 0; i < jt->cnt; i++) {
+ if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) {
+ verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n",
+ t, subprog_start, subprog_end);
+ kvfree(jt);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ return jt;
+}
+
+/* "conditional jump with N edges" */
+static int visit_gotox_insn(int t, struct bpf_verifier_env *env)
+{
+ int *insn_stack = env->cfg.insn_stack;
+ int *insn_state = env->cfg.insn_state;
+ bool keep_exploring = false;
+ struct bpf_iarray *jt;
+ int i, w;
+
+ jt = env->insn_aux_data[t].jt;
+ if (!jt) {
+ jt = create_jt(t, env);
+ if (IS_ERR(jt))
+ return PTR_ERR(jt);
+
+ env->insn_aux_data[t].jt = jt;
+ }
+
+ mark_prune_point(env, t);
+ for (i = 0; i < jt->cnt; i++) {
+ w = jt->items[i];
+ if (w < 0 || w >= env->prog->len) {
+ verbose(env, "indirect jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ mark_jmp_point(env, w);
+
+ /* EXPLORED || DISCOVERED */
+ if (insn_state[w])
+ continue;
+
+ if (env->cfg.cur_stack >= env->prog->len)
+ return -E2BIG;
+
+ insn_stack[env->cfg.cur_stack++] = w;
+ insn_state[w] |= DISCOVERED;
+ keep_exploring = true;
+ }
+
+ return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING;
+}
+
+/*
+ * Instructions that can abnormally return from a subprog (tail_call
+ * upon success, ld_{abs,ind} upon load failure) have a hidden exit
+ * that the verifier must account for.
+ */
+static int visit_abnormal_return_insn(struct bpf_verifier_env *env, int t)
+{
+ struct bpf_subprog_info *subprog;
+ struct bpf_iarray *jt;
+
+ if (env->insn_aux_data[t].jt)
+ return 0;
+
+ jt = bpf_iarray_realloc(NULL, 2);
+ if (!jt)
+ return -ENOMEM;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ jt->items[0] = t + 1;
+ jt->items[1] = subprog->exit_idx;
+ env->insn_aux_data[t].jt = jt;
+ return 0;
+}
+
+/* Visits the instruction at index t and returns one of the following:
+ * < 0 - an error occurred
+ * DONE_EXPLORING - the instruction was fully explored
+ * KEEP_EXPLORING - there is still work to be done before it is fully explored
+ */
+static int visit_insn(int t, struct bpf_verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
+ int ret, off, insn_sz;
+
+ if (bpf_pseudo_func(insn))
+ return visit_func_call_insn(t, insns, env, true);
+
+ /* All non-branch instructions have a single fall-through edge. */
+ if (BPF_CLASS(insn->code) != BPF_JMP &&
+ BPF_CLASS(insn->code) != BPF_JMP32) {
+ if (BPF_CLASS(insn->code) == BPF_LD &&
+ (BPF_MODE(insn->code) == BPF_ABS ||
+ BPF_MODE(insn->code) == BPF_IND)) {
+ ret = visit_abnormal_return_insn(env, t);
+ if (ret)
+ return ret;
+ }
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+ return push_insn(t, t + insn_sz, FALLTHROUGH, env);
+ }
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_EXIT:
+ return DONE_EXPLORING;
+
+ case BPF_CALL:
+ if (bpf_is_async_callback_calling_insn(insn))
+ /* Mark this call insn as a prune point to trigger
+ * is_state_visited() check before call itself is
+ * processed by __check_func_call(). Otherwise new
+ * async state will be pushed for further exploration.
+ */
+ mark_prune_point(env, t);
+ /* For functions that invoke callbacks it is not known how many times
+ * callback would be called. Verifier models callback calling functions
+ * by repeatedly visiting callback bodies and returning to origin call
+ * instruction.
+ * In order to stop such iteration verifier needs to identify when a
+ * state identical some state from a previous iteration is reached.
+ * Check below forces creation of checkpoint before callback calling
+ * instruction to allow search for such identical states.
+ */
+ if (bpf_is_sync_callback_calling_insn(insn)) {
+ mark_calls_callback(env, t);
+ mark_force_checkpoint(env, t);
+ mark_prune_point(env, t);
+ mark_jmp_point(env, t);
+ }
+ if (bpf_helper_call(insn)) {
+ const struct bpf_func_proto *fp;
+
+ ret = bpf_get_helper_proto(env, insn->imm, &fp);
+ /* If called in a non-sleepable context program will be
+ * rejected anyway, so we should end up with precise
+ * sleepable marks on subprogs, except for dead code
+ * elimination.
+ */
+ if (ret == 0 && fp->might_sleep)
+ mark_subprog_might_sleep(env, t);
+ if (bpf_helper_changes_pkt_data(insn->imm))
+ mark_subprog_changes_pkt_data(env, t);
+ if (insn->imm == BPF_FUNC_tail_call) {
+ ret = visit_abnormal_return_insn(env, t);
+ if (ret)
+ return ret;
+ }
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ struct bpf_kfunc_call_arg_meta meta;
+
+ ret = bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
+ if (ret == 0 && bpf_is_iter_next_kfunc(&meta)) {
+ mark_prune_point(env, t);
+ /* Checking and saving state checkpoints at iter_next() call
+ * is crucial for fast convergence of open-coded iterator loop
+ * logic, so we need to force it. If we don't do that,
+ * is_state_visited() might skip saving a checkpoint, causing
+ * unnecessarily long sequence of not checkpointed
+ * instructions and jumps, leading to exhaustion of jump
+ * history buffer, and potentially other undesired outcomes.
+ * It is expected that with correct open-coded iterators
+ * convergence will happen quickly, so we don't run a risk of
+ * exhausting memory.
+ */
+ mark_force_checkpoint(env, t);
+ }
+ /* Same as helpers, if called in a non-sleepable context
+ * program will be rejected anyway, so we should end up
+ * with precise sleepable marks on subprogs, except for
+ * dead code elimination.
+ */
+ if (ret == 0 && bpf_is_kfunc_sleepable(&meta))
+ mark_subprog_might_sleep(env, t);
+ if (ret == 0 && bpf_is_kfunc_pkt_changing(&meta))
+ mark_subprog_changes_pkt_data(env, t);
+ }
+ return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
+
+ case BPF_JA:
+ if (BPF_SRC(insn->code) == BPF_X)
+ return visit_gotox_insn(t, env);
+
+ if (BPF_CLASS(insn->code) == BPF_JMP)
+ off = insn->off;
+ else
+ off = insn->imm;
+
+ /* unconditional jump with single edge */
+ ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ mark_prune_point(env, t + off + 1);
+ mark_jmp_point(env, t + off + 1);
+
+ return ret;
+
+ default:
+ /* conditional jump with two edges */
+ mark_prune_point(env, t);
+ if (bpf_is_may_goto_insn(insn))
+ mark_force_checkpoint(env, t);
+
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ return push_insn(t, t + insn->off + 1, BRANCH, env);
+ }
+}
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+int bpf_check_cfg(struct bpf_verifier_env *env)
+{
+ int insn_cnt = env->prog->len;
+ int *insn_stack, *insn_state;
+ int ex_insn_beg, i, ret = 0;
+
+ insn_state = env->cfg.insn_state = kvzalloc_objs(int, insn_cnt,
+ GFP_KERNEL_ACCOUNT);
+ if (!insn_state)
+ return -ENOMEM;
+
+ insn_stack = env->cfg.insn_stack = kvzalloc_objs(int, insn_cnt,
+ GFP_KERNEL_ACCOUNT);
+ if (!insn_stack) {
+ kvfree(insn_state);
+ return -ENOMEM;
+ }
+
+ ex_insn_beg = env->exception_callback_subprog
+ ? env->subprog_info[env->exception_callback_subprog].start
+ : 0;
+
+ insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
+ insn_stack[0] = 0; /* 0 is the first instruction */
+ env->cfg.cur_stack = 1;
+
+walk_cfg:
+ while (env->cfg.cur_stack > 0) {
+ int t = insn_stack[env->cfg.cur_stack - 1];
+
+ ret = visit_insn(t, env);
+ switch (ret) {
+ case DONE_EXPLORING:
+ insn_state[t] = EXPLORED;
+ env->cfg.cur_stack--;
+ break;
+ case KEEP_EXPLORING:
+ break;
+ default:
+ if (ret > 0) {
+ verifier_bug(env, "visit_insn internal bug");
+ ret = -EFAULT;
+ }
+ goto err_free;
+ }
+ }
+
+ if (env->cfg.cur_stack < 0) {
+ verifier_bug(env, "pop stack internal bug");
+ ret = -EFAULT;
+ goto err_free;
+ }
+
+ if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) {
+ insn_state[ex_insn_beg] = DISCOVERED;
+ insn_stack[0] = ex_insn_beg;
+ env->cfg.cur_stack = 1;
+ goto walk_cfg;
+ }
+
+ for (i = 0; i < insn_cnt; i++) {
+ struct bpf_insn *insn = &env->prog->insnsi[i];
+
+ if (insn_state[i] != EXPLORED) {
+ verbose(env, "unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ if (bpf_is_ldimm64(insn)) {
+ if (insn_state[i + 1] != 0) {
+ verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ i++; /* skip second half of ldimm64 */
+ }
+ }
+ ret = 0; /* cfg looks good */
+ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
+ env->prog->aux->might_sleep = env->subprog_info[0].might_sleep;
+
+err_free:
+ kvfree(insn_state);
+ kvfree(insn_stack);
+ env->cfg.insn_state = env->cfg.insn_stack = NULL;
+ return ret;
+}
+
+/*
+ * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range
+ * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start)
+ * with indices of 'i' instructions in postorder.
+ */
+int bpf_compute_postorder(struct bpf_verifier_env *env)
+{
+ u32 cur_postorder, i, top, stack_sz, s;
+ int *stack = NULL, *postorder = NULL, *state = NULL;
+ struct bpf_iarray *succ;
+
+ postorder = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
+ state = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
+ stack = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
+ if (!postorder || !state || !stack) {
+ kvfree(postorder);
+ kvfree(state);
+ kvfree(stack);
+ return -ENOMEM;
+ }
+ cur_postorder = 0;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ env->subprog_info[i].postorder_start = cur_postorder;
+ stack[0] = env->subprog_info[i].start;
+ stack_sz = 1;
+ do {
+ top = stack[stack_sz - 1];
+ state[top] |= DISCOVERED;
+ if (state[top] & EXPLORED) {
+ postorder[cur_postorder++] = top;
+ stack_sz--;
+ continue;
+ }
+ succ = bpf_insn_successors(env, top);
+ for (s = 0; s < succ->cnt; ++s) {
+ if (!state[succ->items[s]]) {
+ stack[stack_sz++] = succ->items[s];
+ state[succ->items[s]] |= DISCOVERED;
+ }
+ }
+ state[top] |= EXPLORED;
+ } while (stack_sz);
+ }
+ env->subprog_info[i].postorder_start = cur_postorder;
+ env->cfg.insn_postorder = postorder;
+ env->cfg.cur_postorder = cur_postorder;
+ kvfree(stack);
+ kvfree(state);
+ return 0;
+}
+
+/*
+ * Compute strongly connected components (SCCs) on the CFG.
+ * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc.
+ * If instruction is a sole member of its SCC and there are no self edges,
+ * assign it SCC number of zero.
+ * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation.
+ */
+int bpf_compute_scc(struct bpf_verifier_env *env)
+{
+ const u32 NOT_ON_STACK = U32_MAX;
+
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ const u32 insn_cnt = env->prog->len;
+ int stack_sz, dfs_sz, err = 0;
+ u32 *stack, *pre, *low, *dfs;
+ u32 i, j, t, w;
+ u32 next_preorder_num;
+ u32 next_scc_id;
+ bool assign_scc;
+ struct bpf_iarray *succ;
+
+ next_preorder_num = 1;
+ next_scc_id = 1;
+ /*
+ * - 'stack' accumulates vertices in DFS order, see invariant comment below;
+ * - 'pre[t] == p' => preorder number of vertex 't' is 'p';
+ * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n';
+ * - 'dfs' DFS traversal stack, used to emulate explicit recursion.
+ */
+ stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT);
+ if (!stack || !pre || !low || !dfs) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ /*
+ * References:
+ * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms"
+ * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components"
+ *
+ * The algorithm maintains the following invariant:
+ * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]';
+ * - then, vertex 'u' remains on stack while vertex 'v' is on stack.
+ *
+ * Consequently:
+ * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u',
+ * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack,
+ * and thus there is an SCC (loop) containing both 'u' and 'v'.
+ * - If 'low[v] == pre[v]', loops containing 'v' have been explored,
+ * and 'v' can be considered the root of some SCC.
+ *
+ * Here is a pseudo-code for an explicitly recursive version of the algorithm:
+ *
+ * NOT_ON_STACK = insn_cnt + 1
+ * pre = [0] * insn_cnt
+ * low = [0] * insn_cnt
+ * scc = [0] * insn_cnt
+ * stack = []
+ *
+ * next_preorder_num = 1
+ * next_scc_id = 1
+ *
+ * def recur(w):
+ * nonlocal next_preorder_num
+ * nonlocal next_scc_id
+ *
+ * pre[w] = next_preorder_num
+ * low[w] = next_preorder_num
+ * next_preorder_num += 1
+ * stack.append(w)
+ * for s in successors(w):
+ * # Note: for classic algorithm the block below should look as:
+ * #
+ * # if pre[s] == 0:
+ * # recur(s)
+ * # low[w] = min(low[w], low[s])
+ * # elif low[s] != NOT_ON_STACK:
+ * # low[w] = min(low[w], pre[s])
+ * #
+ * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])'
+ * # does not break the invariant and makes iterative version of the algorithm
+ * # simpler. See 'Algorithm #3' from [2].
+ *
+ * # 's' not yet visited
+ * if pre[s] == 0:
+ * recur(s)
+ * # if 's' is on stack, pick lowest reachable preorder number from it;
+ * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]',
+ * # so 'min' would be a noop.
+ * low[w] = min(low[w], low[s])
+ *
+ * if low[w] == pre[w]:
+ * # 'w' is the root of an SCC, pop all vertices
+ * # below 'w' on stack and assign same SCC to them.
+ * while True:
+ * t = stack.pop()
+ * low[t] = NOT_ON_STACK
+ * scc[t] = next_scc_id
+ * if t == w:
+ * break
+ * next_scc_id += 1
+ *
+ * for i in range(0, insn_cnt):
+ * if pre[i] == 0:
+ * recur(i)
+ *
+ * Below implementation replaces explicit recursion with array 'dfs'.
+ */
+ for (i = 0; i < insn_cnt; i++) {
+ if (pre[i])
+ continue;
+ stack_sz = 0;
+ dfs_sz = 1;
+ dfs[0] = i;
+dfs_continue:
+ while (dfs_sz) {
+ w = dfs[dfs_sz - 1];
+ if (pre[w] == 0) {
+ low[w] = next_preorder_num;
+ pre[w] = next_preorder_num;
+ next_preorder_num++;
+ stack[stack_sz++] = w;
+ }
+ /* Visit 'w' successors */
+ succ = bpf_insn_successors(env, w);
+ for (j = 0; j < succ->cnt; ++j) {
+ if (pre[succ->items[j]]) {
+ low[w] = min(low[w], low[succ->items[j]]);
+ } else {
+ dfs[dfs_sz++] = succ->items[j];
+ goto dfs_continue;
+ }
+ }
+ /*
+ * Preserve the invariant: if some vertex above in the stack
+ * is reachable from 'w', keep 'w' on the stack.
+ */
+ if (low[w] < pre[w]) {
+ dfs_sz--;
+ goto dfs_continue;
+ }
+ /*
+ * Assign SCC number only if component has two or more elements,
+ * or if component has a self reference, or if instruction is a
+ * callback calling function (implicit loop).
+ */
+ assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */
+ for (j = 0; j < succ->cnt; ++j) { /* self reference? */
+ if (succ->items[j] == w) {
+ assign_scc = true;
+ break;
+ }
+ }
+ if (bpf_calls_callback(env, w)) /* implicit loop? */
+ assign_scc = true;
+ /* Pop component elements from stack */
+ do {
+ t = stack[--stack_sz];
+ low[t] = NOT_ON_STACK;
+ if (assign_scc)
+ aux[t].scc = next_scc_id;
+ } while (t != w);
+ if (assign_scc)
+ next_scc_id++;
+ dfs_sz--;
+ }
+ }
+ env->scc_info = kvzalloc_objs(*env->scc_info, next_scc_id,
+ GFP_KERNEL_ACCOUNT);
+ if (!env->scc_info) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ env->scc_cnt = next_scc_id;
+exit:
+ kvfree(stack);
+ kvfree(pre);
+ kvfree(low);
+ kvfree(dfs);
+ return err;
+}
diff --git a/kernel/bpf/check_btf.c b/kernel/bpf/check_btf.c
new file mode 100644
index 000000000000..93bebe6fe12e
--- /dev/null
+++ b/kernel/bpf/check_btf.c
@@ -0,0 +1,463 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/filter.h>
+#include <linux/btf.h>
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+static int check_abnormal_return(struct bpf_verifier_env *env)
+{
+ int i;
+
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (env->subprog_info[i].has_ld_abs) {
+ verbose(env, "LD_ABS is not allowed in subprogs without BTF\n");
+ return -EINVAL;
+ }
+ if (env->subprog_info[i].has_tail_call) {
+ verbose(env, "tail_call is not allowed in subprogs without BTF\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/* The minimum supported BTF func info size */
+#define MIN_BPF_FUNCINFO_SIZE 8
+#define MAX_FUNCINFO_REC_SIZE 252
+
+static int check_btf_func_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ u32 krec_size = sizeof(struct bpf_func_info);
+ const struct btf_type *type, *func_proto;
+ u32 i, nfuncs, urec_size, min_size;
+ struct bpf_func_info *krecord;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ u32 prev_offset = 0;
+ bpfptr_t urecord;
+ int ret = -ENOMEM;
+
+ nfuncs = attr->func_info_cnt;
+ if (!nfuncs) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+
+ urec_size = attr->func_info_rec_size;
+ if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
+ urec_size > MAX_FUNCINFO_REC_SIZE ||
+ urec_size % sizeof(u32)) {
+ verbose(env, "invalid func info rec size %u\n", urec_size);
+ return -EINVAL;
+ }
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
+ min_size = min_t(u32, krec_size, urec_size);
+
+ krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!krecord)
+ return -ENOMEM;
+
+ for (i = 0; i < nfuncs; i++) {
+ ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
+ if (ret) {
+ if (ret == -E2BIG) {
+ verbose(env, "nonzero tailing record in func info");
+ /* set the size kernel expects so loader can zero
+ * out the rest of the record.
+ */
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, func_info_rec_size),
+ &min_size, sizeof(min_size)))
+ ret = -EFAULT;
+ }
+ goto err_free;
+ }
+
+ if (copy_from_bpfptr(&krecord[i], urecord, min_size)) {
+ ret = -EFAULT;
+ goto err_free;
+ }
+
+ /* check insn_off */
+ ret = -EINVAL;
+ if (i == 0) {
+ if (krecord[i].insn_off) {
+ verbose(env,
+ "nonzero insn_off %u for the first func info record",
+ krecord[i].insn_off);
+ goto err_free;
+ }
+ } else if (krecord[i].insn_off <= prev_offset) {
+ verbose(env,
+ "same or smaller insn offset (%u) than previous func info record (%u)",
+ krecord[i].insn_off, prev_offset);
+ goto err_free;
+ }
+
+ /* check type_id */
+ type = btf_type_by_id(btf, krecord[i].type_id);
+ if (!type || !btf_type_is_func(type)) {
+ verbose(env, "invalid type id %d in func info",
+ krecord[i].type_id);
+ goto err_free;
+ }
+
+ func_proto = btf_type_by_id(btf, type->type);
+ if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
+ /* btf_func_check() already verified it during BTF load */
+ goto err_free;
+
+ prev_offset = krecord[i].insn_off;
+ bpfptr_add(&urecord, urec_size);
+ }
+
+ prog->aux->func_info = krecord;
+ prog->aux->func_info_cnt = nfuncs;
+ return 0;
+
+err_free:
+ kvfree(krecord);
+ return ret;
+}
+
+static int check_btf_func(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ const struct btf_type *type, *func_proto, *ret_type;
+ u32 i, nfuncs, urec_size;
+ struct bpf_func_info *krecord;
+ struct bpf_func_info_aux *info_aux = NULL;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ bpfptr_t urecord;
+ bool scalar_return;
+ int ret = -ENOMEM;
+
+ nfuncs = attr->func_info_cnt;
+ if (!nfuncs) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+ if (nfuncs != env->subprog_cnt) {
+ verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+ return -EINVAL;
+ }
+
+ urec_size = attr->func_info_rec_size;
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
+
+ krecord = prog->aux->func_info;
+ info_aux = kzalloc_objs(*info_aux, nfuncs,
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!info_aux)
+ return -ENOMEM;
+
+ for (i = 0; i < nfuncs; i++) {
+ /* check insn_off */
+ ret = -EINVAL;
+
+ if (env->subprog_info[i].start != krecord[i].insn_off) {
+ verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+ goto err_free;
+ }
+
+ /* Already checked type_id */
+ type = btf_type_by_id(btf, krecord[i].type_id);
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
+ /* Already checked func_proto */
+ func_proto = btf_type_by_id(btf, type->type);
+
+ ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ scalar_return =
+ btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
+ if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
+ verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
+ goto err_free;
+ }
+ if (i && !scalar_return && env->subprog_info[i].has_tail_call) {
+ verbose(env, "tail_call is only allowed in functions that return 'int'.\n");
+ goto err_free;
+ }
+
+ env->subprog_info[i].name = btf_name_by_offset(btf, type->name_off);
+ bpfptr_add(&urecord, urec_size);
+ }
+
+ prog->aux->func_info_aux = info_aux;
+ return 0;
+
+err_free:
+ kfree(info_aux);
+ return ret;
+}
+
+#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col)
+#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
+
+static int check_btf_line(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
+ struct bpf_subprog_info *sub;
+ struct bpf_line_info *linfo;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ bpfptr_t ulinfo;
+ int err;
+
+ nr_linfo = attr->line_info_cnt;
+ if (!nr_linfo)
+ return 0;
+ if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
+ return -EINVAL;
+
+ rec_size = attr->line_info_rec_size;
+ if (rec_size < MIN_BPF_LINEINFO_SIZE ||
+ rec_size > MAX_LINEINFO_REC_SIZE ||
+ rec_size & (sizeof(u32) - 1))
+ return -EINVAL;
+
+ /* Need to zero it in case the userspace may
+ * pass in a smaller bpf_line_info object.
+ */
+ linfo = kvzalloc_objs(struct bpf_line_info, nr_linfo,
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!linfo)
+ return -ENOMEM;
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ s = 0;
+ sub = env->subprog_info;
+ ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel);
+ expected_size = sizeof(struct bpf_line_info);
+ ncopy = min_t(u32, expected_size, rec_size);
+ for (i = 0; i < nr_linfo; i++) {
+ err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in line_info");
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, line_info_rec_size),
+ &expected_size, sizeof(expected_size)))
+ err = -EFAULT;
+ }
+ goto err_free;
+ }
+
+ if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) {
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ /*
+ * Check insn_off to ensure
+ * 1) strictly increasing AND
+ * 2) bounded by prog->len
+ *
+ * The linfo[0].insn_off == 0 check logically falls into
+ * the later "missing bpf_line_info for func..." case
+ * because the first linfo[0].insn_off must be the
+ * first sub also and the first sub must have
+ * subprog_info[0].start == 0.
+ */
+ if ((i && linfo[i].insn_off <= prev_offset) ||
+ linfo[i].insn_off >= prog->len) {
+ verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
+ i, linfo[i].insn_off, prev_offset,
+ prog->len);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (!prog->insnsi[linfo[i].insn_off].code) {
+ verbose(env,
+ "Invalid insn code at line_info[%u].insn_off\n",
+ i);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (!btf_name_by_offset(btf, linfo[i].line_off) ||
+ !btf_name_by_offset(btf, linfo[i].file_name_off)) {
+ verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (s != env->subprog_cnt) {
+ if (linfo[i].insn_off == sub[s].start) {
+ sub[s].linfo_idx = i;
+ s++;
+ } else if (sub[s].start < linfo[i].insn_off) {
+ verbose(env, "missing bpf_line_info for func#%u\n", s);
+ err = -EINVAL;
+ goto err_free;
+ }
+ }
+
+ prev_offset = linfo[i].insn_off;
+ bpfptr_add(&ulinfo, rec_size);
+ }
+
+ if (s != env->subprog_cnt) {
+ verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
+ env->subprog_cnt - s, s);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ prog->aux->linfo = linfo;
+ prog->aux->nr_linfo = nr_linfo;
+
+ return 0;
+
+err_free:
+ kvfree(linfo);
+ return err;
+}
+
+#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo)
+#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE
+
+static int check_core_relo(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ u32 i, nr_core_relo, ncopy, expected_size, rec_size;
+ struct bpf_core_relo core_relo = {};
+ struct bpf_prog *prog = env->prog;
+ const struct btf *btf = prog->aux->btf;
+ struct bpf_core_ctx ctx = {
+ .log = &env->log,
+ .btf = btf,
+ };
+ bpfptr_t u_core_relo;
+ int err;
+
+ nr_core_relo = attr->core_relo_cnt;
+ if (!nr_core_relo)
+ return 0;
+ if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
+ return -EINVAL;
+
+ rec_size = attr->core_relo_rec_size;
+ if (rec_size < MIN_CORE_RELO_SIZE ||
+ rec_size > MAX_CORE_RELO_SIZE ||
+ rec_size % sizeof(u32))
+ return -EINVAL;
+
+ u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
+ expected_size = sizeof(struct bpf_core_relo);
+ ncopy = min_t(u32, expected_size, rec_size);
+
+ /* Unlike func_info and line_info, copy and apply each CO-RE
+ * relocation record one at a time.
+ */
+ for (i = 0; i < nr_core_relo; i++) {
+ /* future proofing when sizeof(bpf_core_relo) changes */
+ err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in core_relo");
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, core_relo_rec_size),
+ &expected_size, sizeof(expected_size)))
+ err = -EFAULT;
+ }
+ break;
+ }
+
+ if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
+ verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
+ i, core_relo.insn_off, prog->len);
+ err = -EINVAL;
+ break;
+ }
+
+ err = bpf_core_apply(&ctx, &core_relo, i,
+ &prog->insnsi[core_relo.insn_off / 8]);
+ if (err)
+ break;
+ bpfptr_add(&u_core_relo, rec_size);
+ }
+ return err;
+}
+
+int bpf_check_btf_info_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ struct btf *btf;
+ int err;
+
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+
+ btf = btf_get_by_fd(attr->prog_btf_fd);
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ return -EACCES;
+ }
+ env->prog->aux->btf = btf;
+
+ err = check_btf_func_early(env, attr, uattr);
+ if (err)
+ return err;
+ return 0;
+}
+
+int bpf_check_btf_info(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ int err;
+
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+
+ err = check_btf_func(env, attr, uattr);
+ if (err)
+ return err;
+
+ err = check_btf_line(env, attr, uattr);
+ if (err)
+ return err;
+
+ err = check_core_relo(env, attr, uattr);
+ if (err)
+ return err;
+
+ return 0;
+}
diff --git a/kernel/bpf/const_fold.c b/kernel/bpf/const_fold.c
new file mode 100644
index 000000000000..db73c4740b1e
--- /dev/null
+++ b/kernel/bpf/const_fold.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf_verifier.h>
+
+/*
+ * Forward dataflow analysis to determine constant register values at every
+ * instruction. Tracks 64-bit constant values in R0-R9 through the program,
+ * using a fixed-point iteration in reverse postorder. Records which registers
+ * hold known constants and their values in
+ * env->insn_aux_data[].{const_reg_mask, const_reg_vals}.
+ */
+
+enum const_arg_state {
+ CONST_ARG_UNVISITED, /* instruction not yet reached */
+ CONST_ARG_UNKNOWN, /* register value not a known constant */
+ CONST_ARG_CONST, /* register holds a known 64-bit constant */
+ CONST_ARG_MAP_PTR, /* register holds a map pointer, map_index is set */
+ CONST_ARG_MAP_VALUE, /* register points to map value data, val is offset */
+ CONST_ARG_SUBPROG, /* register holds a subprog pointer, val is subprog number */
+};
+
+struct const_arg_info {
+ enum const_arg_state state;
+ u32 map_index;
+ u64 val;
+};
+
+static bool ci_is_unvisited(const struct const_arg_info *ci)
+{
+ return ci->state == CONST_ARG_UNVISITED;
+}
+
+static bool ci_is_unknown(const struct const_arg_info *ci)
+{
+ return ci->state == CONST_ARG_UNKNOWN;
+}
+
+static bool ci_is_const(const struct const_arg_info *ci)
+{
+ return ci->state == CONST_ARG_CONST;
+}
+
+static bool ci_is_map_value(const struct const_arg_info *ci)
+{
+ return ci->state == CONST_ARG_MAP_VALUE;
+}
+
+/* Transfer function: compute output register state from instruction. */
+static void const_reg_xfer(struct bpf_verifier_env *env, struct const_arg_info *ci_out,
+ struct bpf_insn *insn, struct bpf_insn *insns, int idx)
+{
+ struct const_arg_info unknown = { .state = CONST_ARG_UNKNOWN, .val = 0 };
+ struct const_arg_info *dst = &ci_out[insn->dst_reg];
+ struct const_arg_info *src = &ci_out[insn->src_reg];
+ u8 class = BPF_CLASS(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u8 opcode = BPF_OP(insn->code) | BPF_SRC(insn->code);
+ int r;
+
+ switch (class) {
+ case BPF_ALU:
+ case BPF_ALU64:
+ switch (opcode) {
+ case BPF_MOV | BPF_K:
+ dst->state = CONST_ARG_CONST;
+ dst->val = (s64)insn->imm;
+ break;
+ case BPF_MOV | BPF_X:
+ *dst = *src;
+ if (!insn->off)
+ break;
+ if (!ci_is_const(dst)) {
+ *dst = unknown;
+ break;
+ }
+ switch (insn->off) {
+ case 8: dst->val = (s8)dst->val; break;
+ case 16: dst->val = (s16)dst->val; break;
+ case 32: dst->val = (s32)dst->val; break;
+ default: *dst = unknown; break;
+ }
+ break;
+ case BPF_ADD | BPF_K:
+ if (!ci_is_const(dst) && !ci_is_map_value(dst)) {
+ *dst = unknown;
+ break;
+ }
+ dst->val += insn->imm;
+ break;
+ case BPF_SUB | BPF_K:
+ if (!ci_is_const(dst) && !ci_is_map_value(dst)) {
+ *dst = unknown;
+ break;
+ }
+ dst->val -= insn->imm;
+ break;
+ case BPF_AND | BPF_K:
+ if (!ci_is_const(dst)) {
+ if (!insn->imm) {
+ dst->state = CONST_ARG_CONST;
+ dst->val = 0;
+ } else {
+ *dst = unknown;
+ }
+ break;
+ }
+ dst->val &= (s64)insn->imm;
+ break;
+ case BPF_AND | BPF_X:
+ if (ci_is_const(dst) && dst->val == 0)
+ break; /* 0 & x == 0 */
+ if (ci_is_const(src) && src->val == 0) {
+ dst->state = CONST_ARG_CONST;
+ dst->val = 0;
+ break;
+ }
+ if (!ci_is_const(dst) || !ci_is_const(src)) {
+ *dst = unknown;
+ break;
+ }
+ dst->val &= src->val;
+ break;
+ default:
+ *dst = unknown;
+ break;
+ }
+ if (class == BPF_ALU) {
+ if (ci_is_const(dst))
+ dst->val = (u32)dst->val;
+ else if (!ci_is_unknown(dst))
+ *dst = unknown;
+ }
+ break;
+ case BPF_LD:
+ if (mode == BPF_ABS || mode == BPF_IND)
+ goto process_call;
+ if (mode != BPF_IMM || BPF_SIZE(insn->code) != BPF_DW)
+ break;
+ if (insn->src_reg == BPF_PSEUDO_FUNC) {
+ int subprog = bpf_find_subprog(env, idx + insn->imm + 1);
+
+ if (subprog >= 0) {
+ dst->state = CONST_ARG_SUBPROG;
+ dst->val = subprog;
+ } else {
+ *dst = unknown;
+ }
+ } else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
+ insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
+ dst->state = CONST_ARG_MAP_VALUE;
+ dst->map_index = env->insn_aux_data[idx].map_index;
+ dst->val = env->insn_aux_data[idx].map_off;
+ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
+ insn->src_reg == BPF_PSEUDO_MAP_IDX) {
+ dst->state = CONST_ARG_MAP_PTR;
+ dst->map_index = env->insn_aux_data[idx].map_index;
+ } else if (insn->src_reg == 0) {
+ dst->state = CONST_ARG_CONST;
+ dst->val = (u64)(u32)insn->imm | ((u64)(u32)insns[idx + 1].imm << 32);
+ } else {
+ *dst = unknown;
+ }
+ break;
+ case BPF_LDX:
+ if (!ci_is_map_value(src)) {
+ *dst = unknown;
+ break;
+ }
+ struct bpf_map *map = env->used_maps[src->map_index];
+ int size = bpf_size_to_bytes(BPF_SIZE(insn->code));
+ bool is_ldsx = mode == BPF_MEMSX;
+ int off = src->val + insn->off;
+ u64 val = 0;
+
+ if (!bpf_map_is_rdonly(map) || !map->ops->map_direct_value_addr ||
+ map->map_type == BPF_MAP_TYPE_INSN_ARRAY ||
+ off < 0 || off + size > map->value_size ||
+ bpf_map_direct_read(map, off, size, &val, is_ldsx)) {
+ *dst = unknown;
+ break;
+ }
+ dst->state = CONST_ARG_CONST;
+ dst->val = val;
+ break;
+ case BPF_JMP:
+ if (opcode != BPF_CALL)
+ break;
+process_call:
+ for (r = BPF_REG_0; r <= BPF_REG_5; r++)
+ ci_out[r] = unknown;
+ break;
+ case BPF_STX:
+ if (mode != BPF_ATOMIC)
+ break;
+ if (insn->imm == BPF_CMPXCHG)
+ ci_out[BPF_REG_0] = unknown;
+ else if (insn->imm == BPF_LOAD_ACQ)
+ *dst = unknown;
+ else if (insn->imm & BPF_FETCH)
+ *src = unknown;
+ break;
+ }
+}
+
+/* Join function: merge output state into a successor's input state. */
+static bool const_reg_join(struct const_arg_info *ci_target,
+ struct const_arg_info *ci_out)
+{
+ bool changed = false;
+ int r;
+
+ for (r = 0; r < MAX_BPF_REG; r++) {
+ struct const_arg_info *old = &ci_target[r];
+ struct const_arg_info *new = &ci_out[r];
+
+ if (ci_is_unvisited(old) && !ci_is_unvisited(new)) {
+ ci_target[r] = *new;
+ changed = true;
+ } else if (!ci_is_unknown(old) && !ci_is_unvisited(old) &&
+ (new->state != old->state || new->val != old->val ||
+ new->map_index != old->map_index)) {
+ old->state = CONST_ARG_UNKNOWN;
+ changed = true;
+ }
+ }
+ return changed;
+}
+
+int bpf_compute_const_regs(struct bpf_verifier_env *env)
+{
+ struct const_arg_info unknown = { .state = CONST_ARG_UNKNOWN, .val = 0 };
+ struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ struct const_arg_info (*ci_in)[MAX_BPF_REG];
+ struct const_arg_info ci_out[MAX_BPF_REG];
+ struct bpf_iarray *succ;
+ bool changed;
+ int i, r;
+
+ /* kvzalloc zeroes memory, so all entries start as CONST_ARG_UNVISITED (0) */
+ ci_in = kvzalloc_objs(*ci_in, insn_cnt, GFP_KERNEL_ACCOUNT);
+ if (!ci_in)
+ return -ENOMEM;
+
+ /* Subprogram entries (including main at subprog 0): all registers unknown */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ int start = env->subprog_info[i].start;
+
+ for (r = 0; r < MAX_BPF_REG; r++)
+ ci_in[start][r] = unknown;
+ }
+
+redo:
+ changed = false;
+ for (i = env->cfg.cur_postorder - 1; i >= 0; i--) {
+ int idx = env->cfg.insn_postorder[i];
+ struct bpf_insn *insn = &insns[idx];
+ struct const_arg_info *ci = ci_in[idx];
+
+ memcpy(ci_out, ci, sizeof(ci_out));
+
+ const_reg_xfer(env, ci_out, insn, insns, idx);
+
+ succ = bpf_insn_successors(env, idx);
+ for (int s = 0; s < succ->cnt; s++)
+ changed |= const_reg_join(ci_in[succ->items[s]], ci_out);
+ }
+ if (changed)
+ goto redo;
+
+ /* Save computed constants into insn_aux[] if they fit into 32-bit */
+ for (i = 0; i < insn_cnt; i++) {
+ u16 mask = 0, map_mask = 0, subprog_mask = 0;
+ struct bpf_insn_aux_data *aux = &insn_aux[i];
+ struct const_arg_info *ci = ci_in[i];
+
+ for (r = BPF_REG_0; r < ARRAY_SIZE(aux->const_reg_vals); r++) {
+ struct const_arg_info *c = &ci[r];
+
+ switch (c->state) {
+ case CONST_ARG_CONST: {
+ u64 val = c->val;
+
+ if (val != (u32)val)
+ break;
+ mask |= BIT(r);
+ aux->const_reg_vals[r] = val;
+ break;
+ }
+ case CONST_ARG_MAP_PTR:
+ map_mask |= BIT(r);
+ aux->const_reg_vals[r] = c->map_index;
+ break;
+ case CONST_ARG_SUBPROG:
+ subprog_mask |= BIT(r);
+ aux->const_reg_vals[r] = c->val;
+ break;
+ default:
+ break;
+ }
+ }
+ aux->const_reg_mask = mask;
+ aux->const_reg_map_mask = map_mask;
+ aux->const_reg_subprog_mask = subprog_mask;
+ }
+
+ kvfree(ci_in);
+ return 0;
+}
+
+static int eval_const_branch(u8 opcode, u64 dst_val, u64 src_val)
+{
+ switch (BPF_OP(opcode)) {
+ case BPF_JEQ: return dst_val == src_val;
+ case BPF_JNE: return dst_val != src_val;
+ case BPF_JGT: return dst_val > src_val;
+ case BPF_JGE: return dst_val >= src_val;
+ case BPF_JLT: return dst_val < src_val;
+ case BPF_JLE: return dst_val <= src_val;
+ case BPF_JSGT: return (s64)dst_val > (s64)src_val;
+ case BPF_JSGE: return (s64)dst_val >= (s64)src_val;
+ case BPF_JSLT: return (s64)dst_val < (s64)src_val;
+ case BPF_JSLE: return (s64)dst_val <= (s64)src_val;
+ case BPF_JSET: return (bool)(dst_val & src_val);
+ default: return -1;
+ }
+}
+
+/*
+ * Rewrite conditional branches with constant outcomes into unconditional
+ * jumps using register values resolved by bpf_compute_const_regs() pass.
+ * This eliminates dead edges from the CFG so that compute_live_registers()
+ * doesn't propagate liveness through dead code.
+ */
+int bpf_prune_dead_branches(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ bool changed = false;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ struct bpf_insn_aux_data *aux = &insn_aux[i];
+ struct bpf_insn *insn = &insns[i];
+ u8 class = BPF_CLASS(insn->code);
+ u64 dst_val, src_val;
+ int taken;
+
+ if (!bpf_insn_is_cond_jump(insn->code))
+ continue;
+ if (bpf_is_may_goto_insn(insn))
+ continue;
+
+ if (!(aux->const_reg_mask & BIT(insn->dst_reg)))
+ continue;
+ dst_val = aux->const_reg_vals[insn->dst_reg];
+
+ if (BPF_SRC(insn->code) == BPF_K) {
+ src_val = insn->imm;
+ } else {
+ if (!(aux->const_reg_mask & BIT(insn->src_reg)))
+ continue;
+ src_val = aux->const_reg_vals[insn->src_reg];
+ }
+
+ if (class == BPF_JMP32) {
+ /*
+ * The (s32) cast maps the 32-bit range into two u64 sub-ranges:
+ * [0x00000000, 0x7FFFFFFF] -> [0x0000000000000000, 0x000000007FFFFFFF]
+ * [0x80000000, 0xFFFFFFFF] -> [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
+ * The ordering is preserved within each sub-range, and
+ * the second sub-range is above the first as u64.
+ */
+ dst_val = (s32)dst_val;
+ src_val = (s32)src_val;
+ }
+
+ taken = eval_const_branch(insn->code, dst_val, src_val);
+ if (taken < 0) {
+ bpf_log(&env->log, "Unknown conditional jump %x\n", insn->code);
+ return -EFAULT;
+ }
+ *insn = BPF_JMP_A(taken ? insn->off : 0);
+ changed = true;
+ }
+
+ if (!changed)
+ return 0;
+ /* recompute postorder, since CFG has changed */
+ kvfree(env->cfg.insn_postorder);
+ env->cfg.insn_postorder = NULL;
+ return bpf_compute_postorder(env);
+}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7b675a451ec8..066b86e7233c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -18,7 +18,6 @@
*/
#include <uapi/linux/btf.h>
-#include <crypto/sha1.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
@@ -1487,6 +1486,8 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
* know whether fp here is the clone or the original.
*/
fp->aux->prog = fp;
+ if (fp->aux->offload)
+ fp->aux->offload->prog = fp;
bpf_prog_clone_free(fp_other);
}
@@ -2087,12 +2088,12 @@ select_insn:
if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
goto out;
- tail_call_cnt++;
-
prog = READ_ONCE(array->ptrs[index]);
if (!prog)
goto out;
+ tail_call_cnt++;
+
/* ARG1 at this point is guaranteed to point to CTX from
* the verifier side due to the fact that the tail call is
* handled like a helper, that is, bpf_tail_call_proto,
@@ -2613,8 +2614,10 @@ static struct bpf_prog_dummy {
},
};
-struct bpf_empty_prog_array bpf_empty_prog_array = {
- .null_prog = NULL,
+struct bpf_prog_array bpf_empty_prog_array = {
+ .items = {
+ { .prog = NULL },
+ },
};
EXPORT_SYMBOL(bpf_empty_prog_array);
@@ -2625,14 +2628,14 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
if (prog_cnt)
p = kzalloc_flex(*p, items, prog_cnt + 1, flags);
else
- p = &bpf_empty_prog_array.hdr;
+ p = &bpf_empty_prog_array;
return p;
}
void bpf_prog_array_free(struct bpf_prog_array *progs)
{
- if (!progs || progs == &bpf_empty_prog_array.hdr)
+ if (!progs || progs == &bpf_empty_prog_array)
return;
kfree_rcu(progs, rcu);
}
@@ -2641,19 +2644,17 @@ static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
{
struct bpf_prog_array *progs;
- /* If RCU Tasks Trace grace period implies RCU grace period, there is
- * no need to call kfree_rcu(), just call kfree() directly.
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, there is no
+ * need to call kfree_rcu(), just call kfree() directly.
*/
progs = container_of(rcu, struct bpf_prog_array, rcu);
- if (rcu_trace_implies_rcu_gp())
- kfree(progs);
- else
- kfree_rcu(progs, rcu);
+ kfree(progs);
}
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
{
- if (!progs || progs == &bpf_empty_prog_array.hdr)
+ if (!progs || progs == &bpf_empty_prog_array)
return;
call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
}
@@ -3314,6 +3315,63 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
#ifdef CONFIG_BPF_SYSCALL
+void bpf_get_linfo_file_line(struct btf *btf, const struct bpf_line_info *linfo,
+ const char **filep, const char **linep, int *nump)
+{
+ /* Get base component of the file path. */
+ if (filep) {
+ *filep = btf_name_by_offset(btf, linfo->file_name_off);
+ *filep = kbasename(*filep);
+ }
+
+ /* Obtain the source line, and strip whitespace in prefix. */
+ if (linep) {
+ *linep = btf_name_by_offset(btf, linfo->line_off);
+ while (isspace(**linep))
+ *linep += 1;
+ }
+
+ if (nump)
+ *nump = BPF_LINE_INFO_LINE_NUM(linfo->line_col);
+}
+
+const struct bpf_line_info *bpf_find_linfo(const struct bpf_prog *prog, u32 insn_off)
+{
+ const struct bpf_line_info *linfo;
+ u32 nr_linfo;
+ int l, r, m;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo || insn_off >= prog->len)
+ return NULL;
+
+ linfo = prog->aux->linfo;
+ /* Loop invariant: linfo[l].insn_off <= insns_off.
+ * linfo[0].insn_off == 0 which always satisfies above condition.
+ * Binary search is searching for rightmost linfo entry that satisfies
+ * the above invariant, giving us the desired record that covers given
+ * instruction offset.
+ */
+ l = 0;
+ r = nr_linfo - 1;
+ while (l < r) {
+ /* (r - l + 1) / 2 means we break a tie to the right, so if:
+ * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off,
+ * then m=2, we see that linfo[m].insn_off > insn_off, and so
+ * r becomes 1 and we exit the loop with correct l==1.
+ * If the tie was broken to the left, m=1 would end us up in
+ * an endless loop where l and m stay at 1 and r stays at 2.
+ */
+ m = l + (r - l + 1) / 2;
+ if (linfo[m].insn_off <= insn_off)
+ l = m;
+ else
+ r = m - 1;
+ }
+
+ return &linfo[l];
+}
+
int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
const char **linep, int *nump)
{
@@ -3348,14 +3406,7 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char *
if (idx == -1)
return -ENOENT;
- /* Get base component of the file path. */
- *filep = btf_name_by_offset(btf, linfo[idx].file_name_off);
- *filep = kbasename(*filep);
- /* Obtain the source line, and strip whitespace in prefix. */
- *linep = btf_name_by_offset(btf, linfo[idx].line_off);
- while (isspace(**linep))
- *linep += 1;
- *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col);
+ bpf_get_linfo_file_line(btf, &linfo[idx], filep, linep, nump);
return 0;
}
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 32b43cb9061b..5e59ab896f05 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -223,7 +223,10 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
break;
default:
- bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
+ bpf_warn_invalid_xdp_action(xdpf->dev_rx, rcpu->prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(xdpf->dev_rx, rcpu->prog, act);
fallthrough;
case XDP_DROP:
xdp_return_frame(xdpf);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 3d619d01088e..cc0a43ebab6b 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -665,7 +665,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
for (i = 0; i < dtab->n_buckets; i++) {
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_rcu(dst, head, index_hlist,
- lockdep_is_held(&dtab->index_lock)) {
+ rcu_read_lock_bh_held()) {
if (!is_valid_dst(dst, xdpf))
continue;
@@ -747,7 +747,6 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
struct bpf_dtab_netdev *dst, *last_dst = NULL;
int excluded_devices[1+MAX_NEST_DEV];
struct hlist_head *head;
- struct hlist_node *next;
int num_excluded = 0;
unsigned int i;
int err;
@@ -787,7 +786,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
for (i = 0; i < dtab->n_buckets; i++) {
head = dev_map_index_hash(dtab, i);
- hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+ hlist_for_each_entry_rcu(dst, head, index_hlist, rcu_read_lock_bh_held()) {
if (is_ifindex_excluded(excluded_devices, num_excluded,
dst->dev->ifindex))
continue;
diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c
new file mode 100644
index 000000000000..67c9b28767e1
--- /dev/null
+++ b/kernel/bpf/fixups.c
@@ -0,0 +1,2457 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
+#include <linux/perf_event.h>
+#include <net/xdp.h>
+#include "disasm.h"
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+static bool is_cmpxchg_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG;
+}
+
+/* Return the regno defined by the insn, or -1. */
+static int insn_def_regno(const struct bpf_insn *insn)
+{
+ switch (BPF_CLASS(insn->code)) {
+ case BPF_JMP:
+ case BPF_JMP32:
+ case BPF_ST:
+ return -1;
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC ||
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) {
+ if (insn->imm == BPF_CMPXCHG)
+ return BPF_REG_0;
+ else if (insn->imm == BPF_LOAD_ACQ)
+ return insn->dst_reg;
+ else if (insn->imm & BPF_FETCH)
+ return insn->src_reg;
+ }
+ return -1;
+ default:
+ return insn->dst_reg;
+ }
+}
+
+/* Return TRUE if INSN has defined any 32-bit value explicitly. */
+static bool insn_has_def32(struct bpf_insn *insn)
+{
+ int dst_reg = insn_def_regno(insn);
+
+ if (dst_reg == -1)
+ return false;
+
+ return !bpf_is_reg64(insn, dst_reg, NULL, DST_OP);
+}
+
+static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ if (d0->imm != d1->imm)
+ return d0->imm < d1->imm ? -1 : 1;
+ if (d0->offset != d1->offset)
+ return d0->offset < d1->offset ? -1 : 1;
+ return 0;
+}
+
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_kfunc_desc desc = {
+ .imm = insn->imm,
+ .offset = insn->off,
+ };
+ const struct bpf_kfunc_desc *res;
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ res = bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
+
+ return res ? &res->func_model : NULL;
+}
+
+static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc)
+{
+ unsigned long call_imm;
+
+ if (bpf_jit_supports_far_kfunc_call()) {
+ call_imm = desc->func_id;
+ } else {
+ call_imm = BPF_CALL_IMM(desc->addr);
+ /* Check whether the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel func_id %u is out of range\n",
+ desc->func_id);
+ return -EINVAL;
+ }
+ }
+ desc->imm = call_imm;
+ return 0;
+}
+
+static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env)
+{
+ struct bpf_kfunc_desc_tab *tab;
+ int i, err;
+
+ tab = env->prog->aux->kfunc_tab;
+ if (!tab)
+ return 0;
+
+ for (i = 0; i < tab->nr_descs; i++) {
+ err = set_kfunc_desc_imm(env, &tab->descs[i]);
+ if (err)
+ return err;
+ }
+
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_imm_off, NULL);
+ return 0;
+}
+
+static int add_kfunc_in_insns(struct bpf_verifier_env *env,
+ struct bpf_insn *insn, int cnt)
+{
+ int i, ret;
+
+ for (i = 0; i < cnt; i++, insn++) {
+ if (bpf_pseudo_kfunc_call(insn)) {
+ ret = bpf_add_kfunc_call(env, insn->imm, insn->off);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+static int get_callee_stack_depth(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, int idx)
+{
+ int start = idx + insn->imm + 1, subprog;
+
+ subprog = bpf_find_subprog(env, start);
+ if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start))
+ return -EFAULT;
+ return env->subprog_info[subprog].stack_depth;
+}
+#endif
+
+/* single env->prog->insni[off] instruction was replaced with the range
+ * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
+ * [0, off) and [off, end) to new locations, so the patched range stays zero
+ */
+static void adjust_insn_aux_data(struct bpf_verifier_env *env,
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *data = env->insn_aux_data;
+ struct bpf_insn *insn = new_prog->insnsi;
+ u32 old_seen = data[off].seen;
+ u32 prog_len;
+ int i;
+
+ /* aux info at OFF always needs adjustment, no matter fast path
+ * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
+ * original insn at old prog.
+ */
+ data[off].zext_dst = insn_has_def32(insn + off + cnt - 1);
+
+ if (cnt == 1)
+ return;
+ prog_len = new_prog->len;
+
+ memmove(data + off + cnt - 1, data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1));
+ for (i = off; i < off + cnt - 1; i++) {
+ /* Expand insni[off]'s seen count to the patched range. */
+ data[i].seen = old_seen;
+ data[i].zext_dst = insn_has_def32(insn + i);
+ }
+}
+
+static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ if (len == 1)
+ return;
+ /* NOTE: fake 'exit' subprog should be updated as well. */
+ for (i = 0; i <= env->subprog_cnt; i++) {
+ if (env->subprog_info[i].start <= off)
+ continue;
+ env->subprog_info[i].start += len - 1;
+ }
+}
+
+static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ if (len == 1)
+ return;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust(env->insn_array_maps[i], off, len);
+}
+
+static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len);
+}
+
+static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
+{
+ struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
+ int i, sz = prog->aux->size_poke_tab;
+ struct bpf_jit_poke_descriptor *desc;
+
+ for (i = 0; i < sz; i++) {
+ desc = &tab[i];
+ if (desc->insn_idx <= off)
+ continue;
+ desc->insn_idx += len - 1;
+ }
+}
+
+static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ struct bpf_prog *new_prog;
+ struct bpf_insn_aux_data *new_data = NULL;
+
+ if (len > 1) {
+ new_data = vrealloc(env->insn_aux_data,
+ array_size(env->prog->len + len - 1,
+ sizeof(struct bpf_insn_aux_data)),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!new_data)
+ return NULL;
+
+ env->insn_aux_data = new_data;
+ }
+
+ new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
+ if (IS_ERR(new_prog)) {
+ if (PTR_ERR(new_prog) == -ERANGE)
+ verbose(env,
+ "insn %d cannot be patched due to 16-bit range\n",
+ env->insn_aux_data[off].orig_idx);
+ return NULL;
+ }
+ adjust_insn_aux_data(env, new_prog, off, len);
+ adjust_subprog_starts(env, off, len);
+ adjust_insn_arrays(env, off, len);
+ adjust_poke_descs(new_prog, off, len);
+ return new_prog;
+}
+
+/*
+ * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the
+ * jump offset by 'delta'.
+ */
+static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ u32 insn_cnt = prog->len, i;
+ s32 imm;
+ s16 off;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ u8 code = insn->code;
+
+ if (tgt_idx <= i && i < tgt_idx + delta)
+ continue;
+
+ if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) ||
+ BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT)
+ continue;
+
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ if (i + 1 + insn->imm != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->imm, delta, &imm))
+ return -ERANGE;
+ insn->imm = imm;
+ } else {
+ if (i + 1 + insn->off != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->off, delta, &off))
+ return -ERANGE;
+ insn->off = off;
+ }
+ }
+ return 0;
+}
+
+static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
+ u32 off, u32 cnt)
+{
+ int i, j;
+
+ /* find first prog starting at or after off (first to remove) */
+ for (i = 0; i < env->subprog_cnt; i++)
+ if (env->subprog_info[i].start >= off)
+ break;
+ /* find first prog starting at or after off + cnt (first to stay) */
+ for (j = i; j < env->subprog_cnt; j++)
+ if (env->subprog_info[j].start >= off + cnt)
+ break;
+ /* if j doesn't start exactly at off + cnt, we are just removing
+ * the front of previous prog
+ */
+ if (env->subprog_info[j].start != off + cnt)
+ j--;
+
+ if (j > i) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int move;
+
+ /* move fake 'exit' subprog as well */
+ move = env->subprog_cnt + 1 - j;
+
+ memmove(env->subprog_info + i,
+ env->subprog_info + j,
+ sizeof(*env->subprog_info) * move);
+ env->subprog_cnt -= j - i;
+
+ /* remove func_info */
+ if (aux->func_info) {
+ move = aux->func_info_cnt - j;
+
+ memmove(aux->func_info + i,
+ aux->func_info + j,
+ sizeof(*aux->func_info) * move);
+ aux->func_info_cnt -= j - i;
+ /* func_info->insn_off is set after all code rewrites,
+ * in adjust_btf_func() - no need to adjust
+ */
+ }
+ } else {
+ /* convert i from "first prog to remove" to "first to adjust" */
+ if (env->subprog_info[i].start == off)
+ i++;
+ }
+
+ /* update fake 'exit' subprog as well */
+ for (; i <= env->subprog_cnt; i++)
+ env->subprog_info[i].start -= cnt;
+
+ return 0;
+}
+
+static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
+ u32 cnt)
+{
+ struct bpf_prog *prog = env->prog;
+ u32 i, l_off, l_cnt, nr_linfo;
+ struct bpf_line_info *linfo;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo)
+ return 0;
+
+ linfo = prog->aux->linfo;
+
+ /* find first line info to remove, count lines to be removed */
+ for (i = 0; i < nr_linfo; i++)
+ if (linfo[i].insn_off >= off)
+ break;
+
+ l_off = i;
+ l_cnt = 0;
+ for (; i < nr_linfo; i++)
+ if (linfo[i].insn_off < off + cnt)
+ l_cnt++;
+ else
+ break;
+
+ /* First live insn doesn't match first live linfo, it needs to "inherit"
+ * last removed linfo. prog is already modified, so prog->len == off
+ * means no live instructions after (tail of the program was removed).
+ */
+ if (prog->len != off && l_cnt &&
+ (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
+ l_cnt--;
+ linfo[--i].insn_off = off + cnt;
+ }
+
+ /* remove the line info which refer to the removed instructions */
+ if (l_cnt) {
+ memmove(linfo + l_off, linfo + i,
+ sizeof(*linfo) * (nr_linfo - i));
+
+ prog->aux->nr_linfo -= l_cnt;
+ nr_linfo = prog->aux->nr_linfo;
+ }
+
+ /* pull all linfo[i].insn_off >= off + cnt in by cnt */
+ for (i = l_off; i < nr_linfo; i++)
+ linfo[i].insn_off -= cnt;
+
+ /* fix up all subprogs (incl. 'exit') which start >= off */
+ for (i = 0; i <= env->subprog_cnt; i++)
+ if (env->subprog_info[i].linfo_idx > l_off) {
+ /* program may have started in the removed region but
+ * may not be fully removed
+ */
+ if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
+ env->subprog_info[i].linfo_idx -= l_cnt;
+ else
+ env->subprog_info[i].linfo_idx = l_off;
+ }
+
+ return 0;
+}
+
+/*
+ * Clean up dynamically allocated fields of aux data for instructions [start, ...]
+ */
+void bpf_clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int end = start + len;
+ int i;
+
+ for (i = start; i < end; i++) {
+ if (aux_data[i].jt) {
+ kvfree(aux_data[i].jt);
+ aux_data[i].jt = NULL;
+ }
+
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+}
+
+static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ unsigned int orig_prog_len = env->prog->len;
+ int err;
+
+ if (bpf_prog_is_offloaded(env->prog->aux))
+ bpf_prog_offload_remove_insns(env, off, cnt);
+
+ /* Should be called before bpf_remove_insns, as it uses prog->insnsi */
+ bpf_clear_insn_aux_data(env, off, cnt);
+
+ err = bpf_remove_insns(env->prog, off, cnt);
+ if (err)
+ return err;
+
+ err = adjust_subprog_starts_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ err = bpf_adj_linfo_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ adjust_insn_arrays_after_remove(env, off, cnt);
+
+ memmove(aux_data + off, aux_data + off + cnt,
+ sizeof(*aux_data) * (orig_prog_len - off - cnt));
+
+ return 0;
+}
+
+static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0);
+
+bool bpf_insn_is_cond_jump(u8 code)
+{
+ u8 op;
+
+ op = BPF_OP(code);
+ if (BPF_CLASS(code) == BPF_JMP32)
+ return op != BPF_JA;
+
+ if (BPF_CLASS(code) != BPF_JMP)
+ return false;
+
+ return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
+}
+
+void bpf_opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!bpf_insn_is_cond_jump(insn->code))
+ continue;
+
+ if (!aux_data[i + 1].seen)
+ ja.off = insn->off;
+ else if (!aux_data[i + 1 + insn->off].seen)
+ ja.off = 0;
+ else
+ continue;
+
+ if (bpf_prog_is_offloaded(env->prog->aux))
+ bpf_prog_offload_replace_insn(env, i, &ja);
+
+ memcpy(insn, &ja, sizeof(ja));
+ }
+}
+
+int bpf_opt_remove_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ int insn_cnt = env->prog->len;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ int j;
+
+ j = 0;
+ while (i + j < insn_cnt && !aux_data[i + j].seen)
+ j++;
+ if (!j)
+ continue;
+
+ err = verifier_remove_insns(env, i, j);
+ if (err)
+ return err;
+ insn_cnt = env->prog->len;
+ }
+
+ return 0;
+}
+
+int bpf_opt_remove_nops(struct bpf_verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ bool is_may_goto_0, is_ja;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0));
+ is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP));
+
+ if (!is_may_goto_0 && !is_ja)
+ continue;
+
+ err = verifier_remove_insns(env, i, 1);
+ if (err)
+ return err;
+ insn_cnt--;
+ /* Go back one insn to catch may_goto +1; may_goto +0 sequence */
+ i -= (is_may_goto_0 && i > 0) ? 2 : 1;
+ }
+
+ return 0;
+}
+
+int bpf_opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
+ const union bpf_attr *attr)
+{
+ struct bpf_insn *patch;
+ /* use env->insn_buf as two independent buffers */
+ struct bpf_insn *zext_patch = env->insn_buf;
+ struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2];
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ int i, patch_len, delta = 0, len = env->prog->len;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct bpf_prog *new_prog;
+ bool rnd_hi32;
+
+ rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
+ zext_patch[1] = BPF_ZEXT_REG(0);
+ rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
+ rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+ rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
+ for (i = 0; i < len; i++) {
+ int adj_idx = i + delta;
+ struct bpf_insn insn;
+ int load_reg;
+
+ insn = insns[adj_idx];
+ load_reg = insn_def_regno(&insn);
+ if (!aux[adj_idx].zext_dst) {
+ u8 code, class;
+ u32 imm_rnd;
+
+ if (!rnd_hi32)
+ continue;
+
+ code = insn.code;
+ class = BPF_CLASS(code);
+ if (load_reg == -1)
+ continue;
+
+ /* NOTE: arg "reg" (the fourth one) is only used for
+ * BPF_STX + SRC_OP, so it is safe to pass NULL
+ * here.
+ */
+ if (bpf_is_reg64(&insn, load_reg, NULL, DST_OP)) {
+ if (class == BPF_LD &&
+ BPF_MODE(code) == BPF_IMM)
+ i++;
+ continue;
+ }
+
+ /* ctx load could be transformed into wider load. */
+ if (class == BPF_LDX &&
+ aux[adj_idx].ptr_type == PTR_TO_CTX)
+ continue;
+
+ imm_rnd = get_random_u32();
+ rnd_hi32_patch[0] = insn;
+ rnd_hi32_patch[1].imm = imm_rnd;
+ rnd_hi32_patch[3].dst_reg = load_reg;
+ patch = rnd_hi32_patch;
+ patch_len = 4;
+ goto apply_patch_buffer;
+ }
+
+ /* Add in an zero-extend instruction if a) the JIT has requested
+ * it or b) it's a CMPXCHG.
+ *
+ * The latter is because: BPF_CMPXCHG always loads a value into
+ * R0, therefore always zero-extends. However some archs'
+ * equivalent instruction only does this load when the
+ * comparison is successful. This detail of CMPXCHG is
+ * orthogonal to the general zero-extension behaviour of the
+ * CPU, so it's treated independently of bpf_jit_needs_zext.
+ */
+ if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
+ continue;
+
+ /* Zero-extension is done by the caller. */
+ if (bpf_pseudo_kfunc_call(&insn))
+ continue;
+
+ if (verifier_bug_if(load_reg == -1, env,
+ "zext_dst is set, but no reg is defined"))
+ return -EFAULT;
+
+ zext_patch[0] = insn;
+ zext_patch[1].dst_reg = load_reg;
+ zext_patch[1].src_reg = load_reg;
+ patch = zext_patch;
+ patch_len = 2;
+apply_patch_buffer:
+ new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ insns = new_prog->insnsi;
+ aux = env->insn_aux_data;
+ delta += patch_len - 1;
+ }
+
+ return 0;
+}
+
+/* convert load instructions that access fields of a context type into a
+ * sequence of instructions that access fields of the underlying structure:
+ * struct __sk_buff -> struct sk_buff
+ * struct bpf_sock_ops -> struct sock
+ */
+int bpf_convert_ctx_accesses(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ const struct bpf_verifier_ops *ops = env->ops;
+ int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0;
+ const int insn_cnt = env->prog->len;
+ struct bpf_insn *epilogue_buf = env->epilogue_buf;
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_insn *insn;
+ u32 target_size, size_default, off;
+ struct bpf_prog *new_prog;
+ enum bpf_access_type type;
+ bool is_narrower_load;
+ int epilogue_idx = 0;
+
+ if (ops->gen_epilogue) {
+ epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
+ -(subprogs[0].stack_depth + 8));
+ if (epilogue_cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "epilogue is too long");
+ return -EFAULT;
+ } else if (epilogue_cnt) {
+ /* Save the ARG_PTR_TO_CTX for the epilogue to use */
+ cnt = 0;
+ subprogs[0].stack_depth += 8;
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
+ -subprogs[0].stack_depth);
+ insn_buf[cnt++] = env->prog->insnsi[0];
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ if (ops->gen_prologue || env->seen_direct_write) {
+ if (!ops->gen_prologue) {
+ verifier_bug(env, "gen_prologue is null");
+ return -EFAULT;
+ }
+ cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
+ env->prog);
+ if (cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "prologue is too long");
+ return -EFAULT;
+ } else if (cnt) {
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = new_prog;
+ delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, insn_buf, cnt - 1);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ if (delta)
+ WARN_ON(adjust_jmp_off(env->prog, 0, delta));
+
+ if (bpf_prog_is_offloaded(env->prog->aux))
+ return 0;
+
+ insn = env->prog->insnsi + delta;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ bpf_convert_ctx_access_t convert_ctx_access;
+ u8 mode;
+
+ if (env->insn_aux_data[i + delta].nospec) {
+ WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state);
+ struct bpf_insn *patch = insn_buf;
+
+ *patch++ = BPF_ST_NOSPEC();
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ /* This can not be easily merged with the
+ * nospec_result-case, because an insn may require a
+ * nospec before and after itself. Therefore also do not
+ * 'continue' here but potentially apply further
+ * patching to insn. *insn should equal patch[1] now.
+ */
+ }
+
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) {
+ type = BPF_READ;
+ } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
+ type = BPF_WRITE;
+ } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
+ env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
+ insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
+ env->prog->aux->num_exentries++;
+ continue;
+ } else if (insn->code == (BPF_JMP | BPF_EXIT) &&
+ epilogue_cnt &&
+ i + delta < subprogs[1].start) {
+ /* Generate epilogue for the main prog */
+ if (epilogue_idx) {
+ /* jump back to the earlier generated epilogue */
+ insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1);
+ cnt = 1;
+ } else {
+ memcpy(insn_buf, epilogue_buf,
+ epilogue_cnt * sizeof(*epilogue_buf));
+ cnt = epilogue_cnt;
+ /* epilogue_idx cannot be 0. It must have at
+ * least one ctx ptr saving insn before the
+ * epilogue.
+ */
+ epilogue_idx = i + delta;
+ }
+ goto patch_insn_buf;
+ } else {
+ continue;
+ }
+
+ if (type == BPF_WRITE &&
+ env->insn_aux_data[i + delta].nospec_result) {
+ /* nospec_result is only used to mitigate Spectre v4 and
+ * to limit verification-time for Spectre v1.
+ */
+ struct bpf_insn *patch = insn_buf;
+
+ *patch++ = *insn;
+ *patch++ = BPF_ST_NOSPEC();
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ switch ((int)env->insn_aux_data[i + delta].ptr_type) {
+ case PTR_TO_CTX:
+ if (!ops->convert_ctx_access)
+ continue;
+ convert_ctx_access = ops->convert_ctx_access;
+ break;
+ case PTR_TO_SOCKET:
+ case PTR_TO_SOCK_COMMON:
+ convert_ctx_access = bpf_sock_convert_ctx_access;
+ break;
+ case PTR_TO_TCP_SOCK:
+ convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
+ break;
+ case PTR_TO_XDP_SOCK:
+ convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
+ break;
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_UNTRUSTED:
+ /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
+ * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
+ * be said once it is marked PTR_UNTRUSTED, hence we must handle
+ * any faults for loads into such types. BPF_WRITE is disallowed
+ * for this case.
+ */
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
+ case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED:
+ if (type == BPF_READ) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
+ BPF_SIZE((insn)->code);
+ else
+ insn->code = BPF_LDX | BPF_PROBE_MEMSX |
+ BPF_SIZE((insn)->code);
+ env->prog->aux->num_exentries++;
+ }
+ continue;
+ case PTR_TO_ARENA:
+ if (BPF_MODE(insn->code) == BPF_MEMSX) {
+ if (!bpf_jit_supports_insn(insn, true)) {
+ verbose(env, "sign extending loads from arena are not supported yet\n");
+ return -EOPNOTSUPP;
+ }
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code);
+ } else {
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
+ }
+ env->prog->aux->num_exentries++;
+ continue;
+ default:
+ continue;
+ }
+
+ ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
+ size = BPF_LDST_BYTES(insn);
+ mode = BPF_MODE(insn->code);
+
+ /* If the read access is a narrower load of the field,
+ * convert to a 4/8-byte load, to minimum program type specific
+ * convert_ctx_access changes. If conversion is successful,
+ * we will apply proper mask to the result.
+ */
+ is_narrower_load = size < ctx_field_size;
+ size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
+ off = insn->off;
+ if (is_narrower_load) {
+ u8 size_code;
+
+ if (type == BPF_WRITE) {
+ verifier_bug(env, "narrow ctx access misconfigured");
+ return -EFAULT;
+ }
+
+ size_code = BPF_H;
+ if (ctx_field_size == 4)
+ size_code = BPF_W;
+ else if (ctx_field_size == 8)
+ size_code = BPF_DW;
+
+ insn->off = off & ~(size_default - 1);
+ insn->code = BPF_LDX | BPF_MEM | size_code;
+ }
+
+ target_size = 0;
+ cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
+ &target_size);
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
+ (ctx_field_size && !target_size)) {
+ verifier_bug(env, "error during ctx access conversion (%d)", cnt);
+ return -EFAULT;
+ }
+
+ if (is_narrower_load && size < target_size) {
+ u8 shift = bpf_ctx_narrow_access_offset(
+ off, size, size_default) * 8;
+ if (shift && cnt + 1 >= INSN_BUF_SIZE) {
+ verifier_bug(env, "narrow ctx load misconfigured");
+ return -EFAULT;
+ }
+ if (ctx_field_size <= 4) {
+ if (shift)
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
+ insn->dst_reg,
+ shift);
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+ (1 << size * 8) - 1);
+ } else {
+ if (shift)
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
+ insn->dst_reg,
+ shift);
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+ (1ULL << size * 8) - 1);
+ }
+ }
+ if (mode == BPF_MEMSX)
+ insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X,
+ insn->dst_reg, insn->dst_reg,
+ size * 8, 0);
+
+patch_insn_buf:
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
+
+ return 0;
+}
+
+int bpf_jit_subprogs(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog, **func, *tmp;
+ int i, j, subprog_start, subprog_end = 0, len, subprog;
+ struct bpf_map *map_ptr;
+ struct bpf_insn *insn;
+ void *old_bpf_func;
+ int err, num_exentries;
+ int old_len, subprog_start_adjustment = 0;
+
+ if (env->subprog_cnt <= 1)
+ return 0;
+
+ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
+ continue;
+
+ /* Upon error here we cannot fall back to interpreter but
+ * need a hard reject of the program. Thus -EFAULT is
+ * propagated in any case.
+ */
+ subprog = bpf_find_subprog(env, i + insn->imm + 1);
+ if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d",
+ i + insn->imm + 1))
+ return -EFAULT;
+ /* temporarily remember subprog id inside insn instead of
+ * aux_data, since next loop will split up all insns into funcs
+ */
+ insn->off = subprog;
+ /* remember original imm in case JIT fails and fallback
+ * to interpreter will be needed
+ */
+ env->insn_aux_data[i].call_imm = insn->imm;
+ /* point imm to __bpf_call_base+1 from JITs point of view */
+ insn->imm = 1;
+ if (bpf_pseudo_func(insn)) {
+#if defined(MODULES_VADDR)
+ u64 addr = MODULES_VADDR;
+#else
+ u64 addr = VMALLOC_START;
+#endif
+ /* jit (e.g. x86_64) may emit fewer instructions
+ * if it learns a u32 imm is the same as a u64 imm.
+ * Set close enough to possible prog address.
+ */
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+ }
+ }
+
+ err = bpf_prog_alloc_jited_linfo(prog);
+ if (err)
+ goto out_undo_insn;
+
+ err = -ENOMEM;
+ func = kzalloc_objs(prog, env->subprog_cnt);
+ if (!func)
+ goto out_undo_insn;
+
+ for (i = 0; i < env->subprog_cnt; i++) {
+ subprog_start = subprog_end;
+ subprog_end = env->subprog_info[i + 1].start;
+
+ len = subprog_end - subprog_start;
+ /* bpf_prog_run() doesn't call subprogs directly,
+ * hence main prog stats include the runtime of subprogs.
+ * subprogs don't have IDs and not reachable via prog_get_next_id
+ * func[i]->stats will never be accessed and stays NULL
+ */
+ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
+ if (!func[i])
+ goto out_free;
+ memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
+ len * sizeof(struct bpf_insn));
+ func[i]->type = prog->type;
+ func[i]->len = len;
+ if (bpf_prog_calc_tag(func[i]))
+ goto out_free;
+ func[i]->is_func = 1;
+ func[i]->sleepable = prog->sleepable;
+ func[i]->aux->func_idx = i;
+ /* Below members will be freed only at prog->aux */
+ func[i]->aux->btf = prog->aux->btf;
+ func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment;
+ func[i]->aux->func_info = prog->aux->func_info;
+ func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
+ func[i]->aux->poke_tab = prog->aux->poke_tab;
+ func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
+ func[i]->aux->main_prog_aux = prog->aux;
+
+ for (j = 0; j < prog->aux->size_poke_tab; j++) {
+ struct bpf_jit_poke_descriptor *poke;
+
+ poke = &prog->aux->poke_tab[j];
+ if (poke->insn_idx < subprog_end &&
+ poke->insn_idx >= subprog_start)
+ poke->aux = func[i]->aux;
+ }
+
+ func[i]->aux->name[0] = 'F';
+ func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
+ if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+ func[i]->aux->jits_use_priv_stack = true;
+
+ func[i]->jit_requested = 1;
+ func[i]->blinding_requested = prog->blinding_requested;
+ func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
+ func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
+ func[i]->aux->linfo = prog->aux->linfo;
+ func[i]->aux->nr_linfo = prog->aux->nr_linfo;
+ func[i]->aux->jited_linfo = prog->aux->jited_linfo;
+ func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
+ func[i]->aux->arena = prog->aux->arena;
+ func[i]->aux->used_maps = env->used_maps;
+ func[i]->aux->used_map_cnt = env->used_map_cnt;
+ num_exentries = 0;
+ insn = func[i]->insnsi;
+ for (j = 0; j < func[i]->len; j++, insn++) {
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32SX ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
+ num_exentries++;
+ if ((BPF_CLASS(insn->code) == BPF_STX ||
+ BPF_CLASS(insn->code) == BPF_ST) &&
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32)
+ num_exentries++;
+ if (BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
+ num_exentries++;
+ }
+ func[i]->aux->num_exentries = num_exentries;
+ func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+ func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+ func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
+ func[i]->aux->might_sleep = env->subprog_info[i].might_sleep;
+ if (!i)
+ func[i]->aux->exception_boundary = env->seen_exception;
+
+ /*
+ * To properly pass the absolute subprog start to jit
+ * all instruction adjustments should be accumulated
+ */
+ old_len = func[i]->len;
+ func[i] = bpf_int_jit_compile(func[i]);
+ subprog_start_adjustment += func[i]->len - old_len;
+
+ if (!func[i]->jited) {
+ err = -ENOTSUPP;
+ goto out_free;
+ }
+ cond_resched();
+ }
+
+ /* at this point all bpf functions were successfully JITed
+ * now populate all bpf_calls with correct addresses and
+ * run last pass of JIT
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ insn = func[i]->insnsi;
+ for (j = 0; j < func[i]->len; j++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ subprog = insn->off;
+ insn[0].imm = (u32)(long)func[subprog]->bpf_func;
+ insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
+ continue;
+ }
+ if (!bpf_pseudo_call(insn))
+ continue;
+ subprog = insn->off;
+ insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func);
+ }
+
+ /* we use the aux data to keep a list of the start addresses
+ * of the JITed images for each function in the program
+ *
+ * for some architectures, such as powerpc64, the imm field
+ * might not be large enough to hold the offset of the start
+ * address of the callee's JITed image from __bpf_call_base
+ *
+ * in such cases, we can lookup the start address of a callee
+ * by using its subprog id, available from the off field of
+ * the call instruction, as an index for this list
+ */
+ func[i]->aux->func = func;
+ func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ func[i]->aux->real_func_cnt = env->subprog_cnt;
+ }
+ for (i = 0; i < env->subprog_cnt; i++) {
+ old_bpf_func = func[i]->bpf_func;
+ tmp = bpf_int_jit_compile(func[i]);
+ if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
+ verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
+ err = -ENOTSUPP;
+ goto out_free;
+ }
+ cond_resched();
+ }
+
+ /*
+ * Cleanup func[i]->aux fields which aren't required
+ * or can become invalid in future
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ func[i]->aux->used_maps = NULL;
+ func[i]->aux->used_map_cnt = 0;
+ }
+
+ /* finally lock prog and jit images for all functions and
+ * populate kallsysm. Begin at the first subprogram, since
+ * bpf_prog_load will add the kallsyms for the main program.
+ */
+ for (i = 1; i < env->subprog_cnt; i++) {
+ err = bpf_prog_lock_ro(func[i]);
+ if (err)
+ goto out_free;
+ }
+
+ for (i = 1; i < env->subprog_cnt; i++)
+ bpf_prog_kallsyms_add(func[i]);
+
+ /* Last step: make now unused interpreter insns from main
+ * prog consistent for later dump requests, so they can
+ * later look the same as if they were interpreted only.
+ */
+ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ insn[0].imm = env->insn_aux_data[i].call_imm;
+ insn[1].imm = insn->off;
+ insn->off = 0;
+ continue;
+ }
+ if (!bpf_pseudo_call(insn))
+ continue;
+ insn->off = env->insn_aux_data[i].call_imm;
+ subprog = bpf_find_subprog(env, i + insn->off + 1);
+ insn->imm = subprog;
+ }
+
+ prog->jited = 1;
+ prog->bpf_func = func[0]->bpf_func;
+ prog->jited_len = func[0]->jited_len;
+ prog->aux->extable = func[0]->aux->extable;
+ prog->aux->num_exentries = func[0]->aux->num_exentries;
+ prog->aux->func = func;
+ prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ prog->aux->real_func_cnt = env->subprog_cnt;
+ prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
+ prog->aux->exception_boundary = func[0]->aux->exception_boundary;
+ bpf_prog_jit_attempt_done(prog);
+ return 0;
+out_free:
+ /* We failed JIT'ing, so at this point we need to unregister poke
+ * descriptors from subprogs, so that kernel is not attempting to
+ * patch it anymore as we're freeing the subprog JIT memory.
+ */
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
+ map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
+ }
+ /* At this point we're guaranteed that poke descriptors are not
+ * live anymore. We can just unlink its descriptor table as it's
+ * released with the main prog.
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ if (!func[i])
+ continue;
+ func[i]->aux->poke_tab = NULL;
+ bpf_jit_free(func[i]);
+ }
+ kfree(func);
+out_undo_insn:
+ /* cleanup main prog to be interpreted */
+ prog->jit_requested = 0;
+ prog->blinding_requested = 0;
+ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (!bpf_pseudo_call(insn))
+ continue;
+ insn->off = 0;
+ insn->imm = env->insn_aux_data[i].call_imm;
+ }
+ bpf_prog_jit_attempt_done(prog);
+ return err;
+}
+
+int bpf_fixup_call_args(struct bpf_verifier_env *env)
+{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ struct bpf_prog *prog = env->prog;
+ struct bpf_insn *insn = prog->insnsi;
+ bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
+ int i, depth;
+#endif
+ int err = 0;
+
+ if (env->prog->jit_requested &&
+ !bpf_prog_is_offloaded(env->prog->aux)) {
+ err = bpf_jit_subprogs(env);
+ if (err == 0)
+ return 0;
+ if (err == -EFAULT)
+ return err;
+ }
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ if (has_kfunc_call) {
+ verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
+ if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
+ /* When JIT fails the progs with bpf2bpf calls and tail_calls
+ * have to be rejected, since interpreter doesn't support them yet.
+ */
+ verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
+ return -EINVAL;
+ }
+ for (i = 0; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ /* When JIT fails the progs with callback calls
+ * have to be rejected, since interpreter doesn't support them yet.
+ */
+ verbose(env, "callbacks are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
+
+ if (!bpf_pseudo_call(insn))
+ continue;
+ depth = get_callee_stack_depth(env, insn, i);
+ if (depth < 0)
+ return depth;
+ bpf_patch_call_args(insn, depth);
+ }
+ err = 0;
+#endif
+ return err;
+}
+
+
+/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
+static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+{
+ struct bpf_subprog_info *info = env->subprog_info;
+ int cnt = env->subprog_cnt;
+ struct bpf_prog *prog;
+
+ /* We only reserve one slot for hidden subprogs in subprog_info. */
+ if (env->hidden_subprog_cnt) {
+ verifier_bug(env, "only one hidden subprog supported");
+ return -EFAULT;
+ }
+ /* We're not patching any existing instruction, just appending the new
+ * ones for the hidden subprog. Hence all of the adjustment operations
+ * in bpf_patch_insn_data are no-ops.
+ */
+ prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
+ if (!prog)
+ return -ENOMEM;
+ env->prog = prog;
+ info[cnt + 1].start = info[cnt].start;
+ info[cnt].start = prog->len - len + 1;
+ env->subprog_cnt++;
+ env->hidden_subprog_cnt++;
+ return 0;
+}
+
+/* Do various post-verification rewrites in a single program pass.
+ * These rewrites simplify JIT and interpreter implementations.
+ */
+int bpf_do_misc_fixups(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
+ struct bpf_insn *insn = prog->insnsi;
+ const struct bpf_func_proto *fn;
+ const int insn_cnt = prog->len;
+ const struct bpf_map_ops *ops;
+ struct bpf_insn_aux_data *aux;
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_prog *new_prog;
+ struct bpf_map *map_ptr;
+ int i, ret, cnt, delta = 0, cur_subprog = 0;
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_extra = 0;
+
+ if (env->seen_exception && !env->exception_callback_subprog) {
+ struct bpf_insn *patch = insn_buf;
+
+ *patch++ = env->prog->insnsi[insn_cnt - 1];
+ *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *patch++ = BPF_EXIT_INSN();
+ ret = add_hidden_subprog(env, insn_buf, patch - insn_buf);
+ if (ret < 0)
+ return ret;
+ prog = env->prog;
+ insn = prog->insnsi;
+
+ env->exception_callback_subprog = env->subprog_cnt - 1;
+ /* Don't update insn_cnt, as add_hidden_subprog always appends insns */
+ bpf_mark_subprog_exc_cb(env, env->exception_callback_subprog);
+ }
+
+ for (i = 0; i < insn_cnt;) {
+ if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
+ if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
+ (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
+ /* convert to 32-bit mov that clears upper 32-bit */
+ insn->code = BPF_ALU | BPF_MOV | BPF_X;
+ /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */
+ insn->off = 0;
+ insn->imm = 0;
+ } /* cast from as(0) to as(1) should be handled by JIT */
+ goto next_insn;
+ }
+
+ if (env->insn_aux_data[i + delta].needs_zext)
+ /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
+ insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
+
+ /* Make sdiv/smod divide-by-minus-one exceptions impossible. */
+ if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_K)) &&
+ insn->off == 1 && insn->imm == -1) {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ struct bpf_insn *patch = insn_buf;
+
+ if (isdiv)
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ else
+ *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
+
+ cnt = patch - insn_buf;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */
+ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ bool is_sdiv = isdiv && insn->off == 1;
+ bool is_smod = !isdiv && insn->off == 1;
+ struct bpf_insn *patch = insn_buf;
+
+ if (is_sdiv) {
+ /* [R,W]x sdiv 0 -> 0
+ * LLONG_MIN sdiv -1 -> LLONG_MIN
+ * INT_MIN sdiv -1 -> INT_MIN
+ */
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 4, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 1, 0);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_MOV | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ } else if (is_smod) {
+ /* [R,W]x mod 0 -> [R,W]x */
+ /* [R,W]x mod -1 -> 0 */
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 3, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 3 + (is64 ? 0 : 1), 1);
+ *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+
+ if (!is64) {
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
+ }
+ cnt = patch - insn_buf;
+ } else if (isdiv) {
+ /* [R,W]x div 0 -> 0 */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JNE | BPF_K, insn->src_reg,
+ 0, 2, 0);
+ *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ } else {
+ /* [R,W]x mod 0 -> [R,W]x */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, insn->src_reg,
+ 0, 1 + (is64 ? 0 : 1), 0);
+ *patch++ = *insn;
+
+ if (!is64) {
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
+ }
+ cnt = patch - insn_buf;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Make it impossible to de-reference a userspace address */
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+ struct bpf_insn *patch = insn_buf;
+ u64 uaddress_limit = bpf_arch_uaddress_limit();
+
+ if (!uaddress_limit)
+ goto next_insn;
+
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ if (insn->off)
+ *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
+ *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
+ *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
+ *patch++ = *insn;
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
+
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
+ if (BPF_CLASS(insn->code) == BPF_LD &&
+ (BPF_MODE(insn->code) == BPF_ABS ||
+ BPF_MODE(insn->code) == BPF_IND)) {
+ cnt = env->ops->gen_ld_abs(insn, insn_buf);
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "%d insns generated for ld_abs", cnt);
+ return -EFAULT;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Rewrite pointer arithmetic to mitigate speculation attacks. */
+ if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
+ insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
+ const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
+ const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
+ struct bpf_insn *patch = insn_buf;
+ bool issrc, isneg, isimm;
+ u32 off_reg;
+
+ aux = &env->insn_aux_data[i + delta];
+ if (!aux->alu_state ||
+ aux->alu_state == BPF_ALU_NON_POINTER)
+ goto next_insn;
+
+ isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
+ issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
+ BPF_ALU_SANITIZE_SRC;
+ isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
+
+ off_reg = issrc ? insn->src_reg : insn->dst_reg;
+ if (isimm) {
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+ } else {
+ if (isneg)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+ *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+ *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+ *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
+ }
+ if (!issrc)
+ *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
+ insn->src_reg = BPF_REG_AX;
+ if (isneg)
+ insn->code = insn->code == code_add ?
+ code_sub : code_add;
+ *patch++ = *insn;
+ if (issrc && isneg && !isimm)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ cnt = patch - insn_buf;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ if (bpf_is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
+ int stack_off_cnt = -stack_depth - 16;
+
+ /*
+ * Two 8 byte slots, depth-16 stores the count, and
+ * depth-8 stores the start timestamp of the loop.
+ *
+ * The starting value of count is BPF_MAX_TIMED_LOOPS
+ * (0xffff). Every iteration loads it and subs it by 1,
+ * until the value becomes 0 in AX (thus, 1 in stack),
+ * after which we call arch_bpf_timed_may_goto, which
+ * either sets AX to 0xffff to keep looping, or to 0
+ * upon timeout. AX is then stored into the stack. In
+ * the next iteration, we either see 0 and break out, or
+ * continue iterating until the next time value is 0
+ * after subtraction, rinse and repeat.
+ */
+ stack_depth_extra = 16;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
+ /*
+ * AX is used as an argument to pass in stack_off_cnt
+ * (to add to r10/fp), and also as the return value of
+ * the call to arch_bpf_timed_may_goto.
+ */
+ insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
+ insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
+ insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
+ cnt = 7;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ } else if (bpf_is_may_goto_insn(insn)) {
+ int stack_off = -stack_depth - 8;
+
+ stack_depth_extra = 8;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
+ cnt = 4;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ goto next_insn;
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ goto next_insn;
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ ret = bpf_fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
+ if (ret)
+ return ret;
+ if (cnt == 0)
+ goto next_insn;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Skip inlining the helper call if the JIT does it. */
+ if (bpf_jit_inlines_helper_call(insn->imm))
+ goto next_insn;
+
+ if (insn->imm == BPF_FUNC_get_route_realm)
+ prog->dst_needed = 1;
+ if (insn->imm == BPF_FUNC_get_prandom_u32)
+ bpf_user_rnd_init_once();
+ if (insn->imm == BPF_FUNC_override_return)
+ prog->kprobe_override = 1;
+ if (insn->imm == BPF_FUNC_tail_call) {
+ /* If we tail call into other programs, we
+ * cannot make any assumptions since they can
+ * be replaced dynamically during runtime in
+ * the program array.
+ */
+ prog->cb_access = 1;
+ if (!bpf_allow_tail_call_in_subprogs(env))
+ prog->aux->stack_depth = MAX_BPF_STACK;
+ prog->aux->max_pkt_offset = MAX_PACKET_OFF;
+
+ /* mark bpf_tail_call as different opcode to avoid
+ * conditional branch in the interpreter for every normal
+ * call and to prevent accidental JITing by JIT compiler
+ * that doesn't support bpf_tail_call yet
+ */
+ insn->imm = 0;
+ insn->code = BPF_JMP | BPF_TAIL_CALL;
+
+ aux = &env->insn_aux_data[i + delta];
+ if (env->bpf_capable && !prog->blinding_requested &&
+ prog->jit_requested &&
+ !bpf_map_key_poisoned(aux) &&
+ !bpf_map_ptr_poisoned(aux) &&
+ !bpf_map_ptr_unpriv(aux)) {
+ struct bpf_jit_poke_descriptor desc = {
+ .reason = BPF_POKE_REASON_TAIL_CALL,
+ .tail_call.map = aux->map_ptr_state.map_ptr,
+ .tail_call.key = bpf_map_key_immediate(aux),
+ .insn_idx = i + delta,
+ };
+
+ ret = bpf_jit_add_poke_descriptor(prog, &desc);
+ if (ret < 0) {
+ verbose(env, "adding tail call poke descriptor failed\n");
+ return ret;
+ }
+
+ insn->imm = ret + 1;
+ goto next_insn;
+ }
+
+ if (!bpf_map_ptr_unpriv(aux))
+ goto next_insn;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ if (bpf_map_ptr_poisoned(aux)) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+
+ map_ptr = aux->map_ptr_state.map_ptr;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ if (insn->imm == BPF_FUNC_timer_set_callback) {
+ /* The verifier will process callback_fn as many times as necessary
+ * with different maps and the register states prepared by
+ * set_timer_callback_state will be accurate.
+ *
+ * The following use case is valid:
+ * map1 is shared by prog1, prog2, prog3.
+ * prog1 calls bpf_timer_init for some map1 elements
+ * prog2 calls bpf_timer_set_callback for some map1 elements.
+ * Those that were not bpf_timer_init-ed will return -EINVAL.
+ * prog3 calls bpf_timer_start for some map1 elements.
+ * Those that were not both bpf_timer_init-ed and
+ * bpf_timer_set_callback-ed will return -EINVAL.
+ */
+ struct bpf_insn ld_addrs[2] = {
+ BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
+ };
+
+ insn_buf[0] = ld_addrs[0];
+ insn_buf[1] = ld_addrs[1];
+ insn_buf[2] = *insn;
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
+ /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
+ if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
+ /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
+ * bpf_mem_alloc() returns a ptr to the percpu data ptr.
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
+ /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
+ * and other inlining handlers are currently limited to 64 bit
+ * only.
+ */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ (insn->imm == BPF_FUNC_map_lookup_elem ||
+ insn->imm == BPF_FUNC_map_update_elem ||
+ insn->imm == BPF_FUNC_map_delete_elem ||
+ insn->imm == BPF_FUNC_map_push_elem ||
+ insn->imm == BPF_FUNC_map_pop_elem ||
+ insn->imm == BPF_FUNC_map_peek_elem ||
+ insn->imm == BPF_FUNC_redirect_map ||
+ insn->imm == BPF_FUNC_for_each_map_elem ||
+ insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
+ aux = &env->insn_aux_data[i + delta];
+ if (bpf_map_ptr_poisoned(aux))
+ goto patch_call_imm;
+
+ map_ptr = aux->map_ptr_state.map_ptr;
+ ops = map_ptr->ops;
+ if (insn->imm == BPF_FUNC_map_lookup_elem &&
+ ops->map_gen_lookup) {
+ cnt = ops->map_gen_lookup(map_ptr, insn_buf);
+ if (cnt == -EOPNOTSUPP)
+ goto patch_map_ops_generic;
+ if (cnt <= 0 || cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "%d insns generated for map lookup", cnt);
+ return -EFAULT;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta,
+ insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
+ (long (*)(struct bpf_map *map, void *key))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_update_elem,
+ (long (*)(struct bpf_map *map, void *key, void *value,
+ u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_push_elem,
+ (long (*)(struct bpf_map *map, void *value,
+ u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
+ (long (*)(struct bpf_map *map, void *value))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
+ (long (*)(struct bpf_map *map, void *value))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_redirect,
+ (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
+ (long (*)(struct bpf_map *map,
+ bpf_callback_t callback_fn,
+ void *callback_ctx,
+ u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
+ (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));
+
+patch_map_ops_generic:
+ switch (insn->imm) {
+ case BPF_FUNC_map_lookup_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
+ goto next_insn;
+ case BPF_FUNC_map_update_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_update_elem);
+ goto next_insn;
+ case BPF_FUNC_map_delete_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
+ goto next_insn;
+ case BPF_FUNC_map_push_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_push_elem);
+ goto next_insn;
+ case BPF_FUNC_map_pop_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
+ goto next_insn;
+ case BPF_FUNC_map_peek_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
+ goto next_insn;
+ case BPF_FUNC_redirect_map:
+ insn->imm = BPF_CALL_IMM(ops->map_redirect);
+ goto next_insn;
+ case BPF_FUNC_for_each_map_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
+ goto next_insn;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
+ goto next_insn;
+ }
+
+ goto patch_call_imm;
+ }
+
+ /* Implement bpf_jiffies64 inline. */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_jiffies64) {
+ struct bpf_insn ld_jiffies_addr[2] = {
+ BPF_LD_IMM64(BPF_REG_0,
+ (unsigned long)&jiffies),
+ };
+
+ insn_buf[0] = ld_jiffies_addr[0];
+ insn_buf[1] = ld_jiffies_addr[1];
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
+ BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
+ cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ /* Implement bpf_get_smp_processor_id() inline. */
+ if (insn->imm == BPF_FUNC_get_smp_processor_id &&
+ bpf_verifier_inlines_helper_call(env, insn->imm)) {
+ /* BPF_FUNC_get_smp_processor_id inlining is an
+ * optimization, so if cpu_number is ever
+ * changed in some incompatible and hard to support
+ * way, it's fine to back out this inlining logic
+ */
+#ifdef CONFIG_SMP
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number);
+ insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
+ cnt = 3;
+#else
+ insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+ cnt = 1;
+#endif
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. */
+ if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) &&
+ bpf_verifier_inlines_helper_call(env, insn->imm)) {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&current_task);
+ insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+#endif
+ /* Implement bpf_get_func_arg inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg) {
+ if (eatype == BPF_TRACE_RAW_TP) {
+ int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
+
+ /* skip 'void *__data' in btf_trace_##name() and save to reg0 */
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
+ cnt = 1;
+ } else {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ cnt = 2;
+ }
+ insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
+ insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ insn_buf[cnt++] = BPF_JMP_A(1);
+ insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_func_ret inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ret) {
+ if (eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_TRACE_FSESSION ||
+ eatype == BPF_MODIFY_RETURN) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
+ insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ cnt = 7;
+ } else {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
+ cnt = 1;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement get_func_arg_cnt inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg_cnt) {
+ if (eatype == BPF_TRACE_RAW_TP) {
+ int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
+
+ /* skip 'void *__data' in btf_trace_##name() and save to reg0 */
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
+ cnt = 1;
+ } else {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ cnt = 2;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_func_ip inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ip) {
+ /* Load IP address from ctx - 16 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_branch_snapshot inline. */
+ if (IS_ENABLED(CONFIG_PERF_EVENTS) &&
+ prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_get_branch_snapshot) {
+ /* We are dealing with the following func protos:
+ * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
+ * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
+ */
+ const u32 br_entry_size = sizeof(struct perf_branch_entry);
+
+ /* struct perf_branch_entry is part of UAPI and is
+ * used as an array element, so extremely unlikely to
+ * ever grow or shrink
+ */
+ BUILD_BUG_ON(br_entry_size != 24);
+
+ /* if (unlikely(flags)) return -EINVAL */
+ insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
+
+ /* Transform size (bytes) into number of entries (cnt = size / 24).
+ * But to avoid expensive division instruction, we implement
+ * divide-by-3 through multiplication, followed by further
+ * division by 8 through 3-bit right shift.
+ * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
+ * p. 227, chapter "Unsigned Division by 3" for details and proofs.
+ *
+ * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
+ */
+ insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
+ insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
+
+ /* call perf_snapshot_branch_stack implementation */
+ insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
+ /* if (entry_cnt == 0) return -ENOENT */
+ insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
+ /* return entry_cnt * sizeof(struct perf_branch_entry) */
+ insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
+ insn_buf[7] = BPF_JMP_A(3);
+ /* return -EINVAL; */
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ insn_buf[9] = BPF_JMP_A(1);
+ /* return -ENOENT; */
+ insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
+ cnt = 11;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_kptr_xchg inline */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_kptr_xchg &&
+ bpf_jit_supports_ptr_xchg()) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2);
+ insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0);
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+patch_call_imm:
+ fn = env->ops->get_func_proto(insn->imm, env->prog);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ if (!fn->func) {
+ verifier_bug(env,
+ "not inlined functions %s#%d is missing func",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
+ insn->imm = fn->func - __bpf_call_base;
+next_insn:
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ subprogs[cur_subprog].stack_extra = stack_depth_extra;
+
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) {
+ verbose(env, "stack size %d(extra %d) is too large\n",
+ stack_depth, stack_depth_extra);
+ return -EINVAL;
+ }
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_extra = 0;
+ }
+ i++;
+ insn++;
+ }
+
+ env->prog->aux->stack_depth = subprogs[0].stack_depth;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
+ int subprog_start = subprogs[i].start;
+ int stack_slots = subprogs[i].stack_extra / 8;
+ int slots = delta, cnt = 0;
+
+ if (!stack_slots)
+ continue;
+ /* We need two slots in case timed may_goto is supported. */
+ if (stack_slots > slots) {
+ verifier_bug(env, "stack_slots supports may_goto only");
+ return -EFAULT;
+ }
+
+ stack_depth = subprogs[i].stack_depth;
+ if (bpf_jit_supports_timed_may_goto()) {
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_TIMED_LOOPS);
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
+ } else {
+ /* Add ST insn to subprog prologue to init extra stack */
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_LOOPS);
+ }
+ /* Copy first actual insn to preserve it */
+ insn_buf[cnt++] = env->prog->insnsi[subprog_start];
+
+ new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = prog = new_prog;
+ /*
+ * If may_goto is a first insn of a prog there could be a jmp
+ * insn that points to it, hence adjust all such jmps to point
+ * to insn after BPF_ST that inits may_goto count.
+ * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
+ */
+ WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
+ }
+
+ /* Since poke tab is now finalized, publish aux to tracker. */
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
+ if (!map_ptr->ops->map_poke_track ||
+ !map_ptr->ops->map_poke_untrack ||
+ !map_ptr->ops->map_poke_run) {
+ verifier_bug(env, "poke tab is misconfigured");
+ return -EFAULT;
+ }
+
+ ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
+ if (ret < 0) {
+ verbose(env, "tracking tail call prog failed\n");
+ return ret;
+ }
+ }
+
+ ret = sort_kfunc_descs_by_imm_off(env);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+ int position,
+ s32 stack_base,
+ u32 callback_subprogno,
+ u32 *total_cnt)
+{
+ s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+ s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+ s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+ int reg_loop_max = BPF_REG_6;
+ int reg_loop_cnt = BPF_REG_7;
+ int reg_loop_ctx = BPF_REG_8;
+
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_prog *new_prog;
+ u32 callback_start;
+ u32 call_insn_offset;
+ s32 callback_offset;
+ u32 cnt = 0;
+
+ /* This represents an inlined version of bpf_iter.c:bpf_loop,
+ * be careful to modify this code in sync.
+ */
+
+ /* Return error and jump to the end of the patch if
+ * expected number of iterations is too big.
+ */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
+ insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16);
+ /* spill R6, R7, R8 to use these as loop vars */
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
+ /* initialize loop vars */
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
+ insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0);
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
+ /* loop header,
+ * if reg_loop_cnt >= reg_loop_max skip the loop body
+ */
+ insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5);
+ /* callback call,
+ * correct callback offset would be set after patching
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
+ insn_buf[cnt++] = BPF_CALL_REL(0);
+ /* increment loop counter */
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
+ /* jump to loop header if callback returned 0 */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6);
+ /* return value of bpf_loop,
+ * set R0 to the number of iterations
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
+ /* restore original values of R6, R7, R8 */
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
+
+ *total_cnt = cnt;
+ new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt);
+ if (!new_prog)
+ return new_prog;
+
+ /* callback start is known only after patching */
+ callback_start = env->subprog_info[callback_subprogno].start;
+ /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
+ call_insn_offset = position + 12;
+ callback_offset = callback_start - call_insn_offset - 1;
+ new_prog->insnsi[call_insn_offset].imm = callback_offset;
+
+ return new_prog;
+}
+
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_loop;
+}
+
+/* For all sub-programs in the program (including main) check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found the calls are replaced with a
+ * sequence of instructions produced by `inline_bpf_loop` function and
+ * subprog stack_depth is increased by the size of 3 registers.
+ * This stack space is used to spill values of the R6, R7, R8. These
+ * registers are used to store the loop bound, counter and context
+ * variables.
+ */
+int bpf_optimize_bpf_loop(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ int i, cur_subprog = 0, cnt, delta = 0;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ u16 stack_depth_extra = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ struct bpf_loop_inline_state *inline_state =
+ &env->insn_aux_data[i + delta].loop_inline_state;
+
+ if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
+ struct bpf_prog *new_prog;
+
+ stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
+ new_prog = inline_bpf_loop(env,
+ i + delta,
+ -(stack_depth + stack_depth_extra),
+ inline_state->callback_subprogno,
+ &cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
+
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ stack_depth_extra = 0;
+ }
+ }
+
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+
+ return 0;
+}
+
+/* Remove unnecessary spill/fill pairs, members of fastcall pattern,
+ * adjust subprograms stack depth when possible.
+ */
+int bpf_remove_fastcall_spills_fills(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u32 spills_num;
+ bool modified = false;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (aux[i].fastcall_spills_num > 0) {
+ spills_num = aux[i].fastcall_spills_num;
+ /* NOPs would be removed by opt_remove_nops() */
+ for (j = 1; j <= spills_num; ++j) {
+ *(insn - j) = NOP;
+ *(insn + j) = NOP;
+ }
+ modified = true;
+ }
+ if ((subprog + 1)->start == i + 1) {
+ if (modified && !subprog->keep_fastcall_stack)
+ subprog->stack_depth = -subprog->fastcall_stack_off;
+ subprog++;
+ modified = false;
+ }
+ }
+
+ return 0;
+}
+
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index bc6bc8bb871d..3dd9b4924ae4 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1056,7 +1056,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
for_each_possible_cpu(cpu) {
if (cpu == current_cpu)
- copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
+ copy_map_value(&htab->map, per_cpu_ptr(pptr, cpu), value);
else /* Since elem is preallocated, we cannot touch special fields */
zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
}
@@ -1138,6 +1138,10 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
} else if (fd_htab_map_needs_adjust(htab)) {
size = round_up(size, 8);
memcpy(htab_elem_value(l_new, key_size), value, size);
+ } else if (map_flags & BPF_F_LOCK) {
+ copy_map_value_locked(&htab->map,
+ htab_elem_value(l_new, key_size),
+ value, false);
} else {
copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6eb6c82ed2ee..bb95e287b0dc 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1272,7 +1272,7 @@ static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu)
return;
}
- /* rcu_trace_implies_rcu_gp() is true and will remain so */
+ /* RCU Tasks Trace grace period implies RCU grace period. */
bpf_async_cb_rcu_free(rcu);
}
@@ -2302,9 +2302,20 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
__bpf_kfunc_start_defs();
-__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+/**
+ * bpf_obj_new() - allocate an object described by program BTF
+ * @local_type_id__k: type ID in program BTF
+ * @meta: verifier-supplied struct metadata
+ *
+ * Allocate an object of the type identified by @local_type_id__k and
+ * initialize its special fields. BPF programs can use
+ * bpf_core_type_id_local() to provide @local_type_id__k. The verifier
+ * rewrites @meta; BPF programs do not set it.
+ *
+ * Return: Pointer to the allocated object, or %NULL on failure.
+ */
+__bpf_kfunc void *bpf_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta)
{
- struct btf_struct_meta *meta = meta__ign;
u64 size = local_type_id__k;
void *p;
@@ -2313,17 +2324,39 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
return NULL;
if (meta)
bpf_obj_init(meta->record, p);
+
return p;
}
-__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ return bpf_obj_new(local_type_id__k, meta__ign);
+}
+
+/**
+ * bpf_percpu_obj_new() - allocate a percpu object described by program BTF
+ * @local_type_id__k: type ID in program BTF
+ * @meta: verifier-supplied struct metadata
+ *
+ * Allocate a percpu object of the type identified by @local_type_id__k. BPF
+ * programs can use bpf_core_type_id_local() to provide @local_type_id__k.
+ * The verifier rewrites @meta; BPF programs do not set it.
+ *
+ * Return: Pointer to the allocated percpu object, or %NULL on failure.
+ */
+__bpf_kfunc void *bpf_percpu_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta)
{
u64 size = local_type_id__k;
- /* The verifier has ensured that meta__ign must be NULL */
+ /* The verifier has ensured that meta must be NULL */
return bpf_mem_alloc(&bpf_global_percpu_ma, size);
}
+__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ return bpf_percpu_obj_new(local_type_id__k, meta__ign);
+}
+
/* Must be called under migrate_disable(), as required by bpf_mem_free */
void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
{
@@ -2347,23 +2380,56 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
bpf_mem_free_rcu(ma, p);
}
-__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+/**
+ * bpf_obj_drop() - drop a previously allocated object
+ * @p__alloc: object to free
+ * @meta: verifier-supplied struct metadata
+ *
+ * Destroy special fields in @p__alloc as needed and free the object. The
+ * verifier rewrites @meta; BPF programs do not set it.
+ */
+__bpf_kfunc void bpf_obj_drop(void *p__alloc, struct btf_struct_meta *meta)
{
- struct btf_struct_meta *meta = meta__ign;
void *p = p__alloc;
__bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
}
-__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
+__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
{
- /* The verifier has ensured that meta__ign must be NULL */
+ return bpf_obj_drop(p__alloc, meta__ign);
+}
+
+/**
+ * bpf_percpu_obj_drop() - drop a previously allocated percpu object
+ * @p__alloc: percpu object to free
+ * @meta: verifier-supplied struct metadata
+ *
+ * Free @p__alloc. The verifier rewrites @meta; BPF programs do not set it.
+ */
+__bpf_kfunc void bpf_percpu_obj_drop(void *p__alloc, struct btf_struct_meta *meta)
+{
+ /* The verifier has ensured that meta must be NULL */
bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
}
-__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
+__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ bpf_percpu_obj_drop(p__alloc, meta__ign);
+}
+
+/**
+ * bpf_refcount_acquire() - turn a local kptr into an owning reference
+ * @p__refcounted_kptr: non-owning local kptr
+ * @meta: verifier-supplied struct metadata
+ *
+ * Increment the refcount for @p__refcounted_kptr. The verifier rewrites
+ * @meta; BPF programs do not set it.
+ *
+ * Return: Owning reference to @p__refcounted_kptr, or %NULL on failure.
+ */
+__bpf_kfunc void *bpf_refcount_acquire(void *p__refcounted_kptr, struct btf_struct_meta *meta)
{
- struct btf_struct_meta *meta = meta__ign;
struct bpf_refcount *ref;
/* Could just cast directly to refcount_t *, but need some code using
@@ -2379,6 +2445,11 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta
return (void *)p__refcounted_kptr;
}
+__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
+{
+ return bpf_refcount_acquire(p__refcounted_kptr, meta__ign);
+}
+
static int __bpf_list_add(struct bpf_list_node_kern *node,
struct bpf_list_head *head,
bool tail, struct btf_record *rec, u64 off)
@@ -2406,24 +2477,62 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
return 0;
}
+/**
+ * bpf_list_push_front() - add a node to the front of a BPF linked list
+ * @head: list head
+ * @node: node to insert
+ * @meta: verifier-supplied struct metadata
+ * @off: verifier-supplied offset of @node within the containing object
+ *
+ * Insert @node at the front of @head. The verifier rewrites @meta and @off;
+ * BPF programs do not set them.
+ *
+ * Return: 0 on success, or %-EINVAL if @node is already linked.
+ */
+__bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ struct btf_struct_meta *meta,
+ u64 off)
+{
+ struct bpf_list_node_kern *n = (void *)node;
+
+ return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
+}
+
__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
struct bpf_list_node *node,
void *meta__ign, u64 off)
{
+ return bpf_list_push_front(head, node, meta__ign, off);
+}
+
+/**
+ * bpf_list_push_back() - add a node to the back of a BPF linked list
+ * @head: list head
+ * @node: node to insert
+ * @meta: verifier-supplied struct metadata
+ * @off: verifier-supplied offset of @node within the containing object
+ *
+ * Insert @node at the back of @head. The verifier rewrites @meta and @off;
+ * BPF programs do not set them.
+ *
+ * Return: 0 on success, or %-EINVAL if @node is already linked.
+ */
+__bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ struct btf_struct_meta *meta,
+ u64 off)
+{
struct bpf_list_node_kern *n = (void *)node;
- struct btf_struct_meta *meta = meta__ign;
- return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
+ return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
}
__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
struct bpf_list_node *node,
void *meta__ign, u64 off)
{
- struct bpf_list_node_kern *n = (void *)node;
- struct btf_struct_meta *meta = meta__ign;
-
- return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
+ return bpf_list_push_back(head, node, meta__ign, off);
}
static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
@@ -2535,16 +2644,37 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
return 0;
}
-__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
- bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
- void *meta__ign, u64 off)
+/**
+ * bpf_rbtree_add() - add a node to a BPF rbtree
+ * @root: tree root
+ * @node: node to insert
+ * @less: comparator used to order nodes
+ * @meta: verifier-supplied struct metadata
+ * @off: verifier-supplied offset of @node within the containing object
+ *
+ * Insert @node into @root using @less. The verifier rewrites @meta and @off;
+ * BPF programs do not set them.
+ *
+ * Return: 0 on success, or %-EINVAL if @node is already linked in a tree.
+ */
+__bpf_kfunc int bpf_rbtree_add(struct bpf_rb_root *root,
+ struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+ struct btf_struct_meta *meta,
+ u64 off)
{
- struct btf_struct_meta *meta = meta__ign;
struct bpf_rb_node_kern *n = (void *)node;
return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
}
+__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+ void *meta__ign, u64 off)
+{
+ return bpf_rbtree_add(root, node, less, meta__ign, off);
+}
+
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
{
struct rb_root_cached *r = (struct rb_root_cached *)root;
@@ -4165,17 +4295,25 @@ static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
return refcount_inc_not_zero(&ctx->refcnt);
}
+static void bpf_task_work_destroy(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+
+ bpf_task_work_ctx_reset(ctx);
+ kfree_rcu(ctx, rcu);
+}
+
static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
{
if (!refcount_dec_and_test(&ctx->refcnt))
return;
- bpf_task_work_ctx_reset(ctx);
-
- /* bpf_mem_free expects migration to be disabled */
- migrate_disable();
- bpf_mem_free(&bpf_global_ma, ctx);
- migrate_enable();
+ if (irqs_disabled()) {
+ ctx->irq_work = IRQ_WORK_INIT(bpf_task_work_destroy);
+ irq_work_queue(&ctx->irq_work);
+ } else {
+ bpf_task_work_destroy(&ctx->irq_work);
+ }
}
static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
@@ -4229,7 +4367,7 @@ static void bpf_task_work_irq(struct irq_work *irq_work)
enum bpf_task_work_state state;
int err;
- guard(rcu_tasks_trace)();
+ guard(rcu)();
if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
bpf_task_work_ctx_put(ctx);
@@ -4251,9 +4389,9 @@ static void bpf_task_work_irq(struct irq_work *irq_work)
/*
* It's technically possible for just scheduled task_work callback to
* complete running by now, going SCHEDULING -> RUNNING and then
- * dropping its ctx refcount. Instead of capturing extra ref just to
- * protected below ctx->state access, we rely on RCU protection to
- * perform below SCHEDULING -> SCHEDULED attempt.
+ * dropping its ctx refcount. Instead of capturing an extra ref just
+ * to protect below ctx->state access, we rely on rcu_read_lock
+ * above to prevent kfree_rcu from freeing ctx before we return.
*/
state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
if (state == BPF_TW_FREED)
@@ -4270,7 +4408,7 @@ static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *t
if (ctx)
return ctx;
- ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
+ ctx = bpf_map_kmalloc_nolock(map, sizeof(*ctx), 0, NUMA_NO_NODE);
if (!ctx)
return ERR_PTR(-ENOMEM);
@@ -4284,7 +4422,7 @@ static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *t
* tw->ctx is set by concurrent BPF program, release allocated
* memory and try to reuse already set context.
*/
- bpf_mem_free(&bpf_global_ma, ctx);
+ kfree_nolock(ctx);
return old_ctx;
}
@@ -4296,13 +4434,23 @@ static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work
{
struct bpf_task_work_ctx *ctx;
- ctx = bpf_task_work_fetch_ctx(tw, map);
- if (IS_ERR(ctx))
- return ctx;
-
- /* try to get ref for task_work callback to hold */
- if (!bpf_task_work_ctx_tryget(ctx))
- return ERR_PTR(-EBUSY);
+ /*
+ * Sleepable BPF programs hold rcu_read_lock_trace but not
+ * regular rcu_read_lock. Since kfree_rcu waits for regular
+ * RCU GP, the ctx can be freed while we're between reading
+ * the pointer and incrementing the refcount. Take regular
+ * rcu_read_lock to prevent kfree_rcu from freeing the ctx
+ * before we can tryget it.
+ */
+ scoped_guard(rcu) {
+ ctx = bpf_task_work_fetch_ctx(tw, map);
+ if (IS_ERR(ctx))
+ return ctx;
+
+ /* try to get ref for task_work callback to hold */
+ if (!bpf_task_work_ctx_tryget(ctx))
+ return ERR_PTR(-EBUSY);
+ }
if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
/* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
@@ -4417,7 +4565,7 @@ static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
return -EINVAL;
}
- state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl));
+ state = kmalloc_nolock(sizeof(*state), 0, NUMA_NO_NODE);
if (!state) {
bpf_dynptr_set_null(ptr);
return -ENOMEM;
@@ -4449,7 +4597,7 @@ __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
return 0;
freader_cleanup(&df->freader);
- bpf_mem_free(&bpf_global_ma, df);
+ kfree_nolock(df);
bpf_dynptr_set_null(ptr);
return 0;
}
@@ -4536,12 +4684,19 @@ BTF_KFUNCS_START(generic_btf_ids)
#ifdef CONFIG_CRASH_DUMP
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
+BTF_ID_FLAGS(func, bpf_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_percpu_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_percpu_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_refcount_acquire, KF_ACQUIRE | KF_RET_NULL | KF_RCU | KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
+BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_list_push_front_impl)
+BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
@@ -4550,6 +4705,7 @@ BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_add, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
@@ -4578,6 +4734,9 @@ BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
#endif
#endif
+#ifdef CONFIG_S390
+BTF_ID_FLAGS(func, bpf_get_lowcore)
+#endif
BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index 998986853c61..1fb4c511db5a 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -2,217 +2,119 @@
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
#include <linux/hashtable.h>
#include <linux/jhash.h>
#include <linux/slab.h>
+#include <linux/sort.h>
-/*
- * This file implements live stack slots analysis. After accumulating
- * stack usage data, the analysis answers queries about whether a
- * particular stack slot may be read by an instruction or any of it's
- * successors. This data is consumed by the verifier states caching
- * mechanism to decide which stack slots are important when looking for a
- * visited state corresponding to the current state.
- *
- * The analysis is call chain sensitive, meaning that data is collected
- * and queried for tuples (call chain, subprogram instruction index).
- * Such sensitivity allows identifying if some subprogram call always
- * leads to writes in the caller's stack.
- *
- * The basic idea is as follows:
- * - As the verifier accumulates a set of visited states, the analysis instance
- * accumulates a conservative estimate of stack slots that can be read
- * or must be written for each visited tuple (call chain, instruction index).
- * - If several states happen to visit the same instruction with the same
- * call chain, stack usage information for the corresponding tuple is joined:
- * - "may_read" set represents a union of all possibly read slots
- * (any slot in "may_read" set might be read at or after the instruction);
- * - "must_write" set represents an intersection of all possibly written slots
- * (any slot in "must_write" set is guaranteed to be written by the instruction).
- * - The analysis is split into two phases:
- * - read and write marks accumulation;
- * - read and write marks propagation.
- * - The propagation phase is a textbook live variable data flow analysis:
- *
- * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)]
- * state[cc, i].live_before =
- * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read
- *
- * Where:
- * - `U` stands for set union
- * - `/` stands for set difference;
- * - `cc` stands for a call chain;
- * - `i` and `s` are instruction indexes;
- *
- * The above equations are computed for each call chain and instruction
- * index until state stops changing.
- * - Additionally, in order to transfer "must_write" information from a
- * subprogram to call instructions invoking this subprogram,
- * the "must_write_acc" set is tracked for each (cc, i) tuple.
- * A set of stack slots that are guaranteed to be written by this
- * instruction or any of its successors (within the subprogram).
- * The equation for "must_write_acc" propagation looks as follows:
- *
- * state[cc, i].must_write_acc =
- * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)]
- * U state[cc, i].must_write
- *
- * (An intersection of all "must_write_acc" for instruction successors
- * plus all "must_write" slots for the instruction itself).
- * - After the propagation phase completes for a subprogram, information from
- * (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain:
- * - "must_write_acc" set is intersected with the call site's "must_write" set;
- * - "may_read" set is added to the call site's "may_read" set.
- * - Any live stack queries must be taken after the propagation phase.
- * - Accumulation and propagation phases can be entered multiple times,
- * at any point in time:
- * - "may_read" set only grows;
- * - "must_write" set only shrinks;
- * - for each visited verifier state with zero branches, all relevant
- * read and write marks are already recorded by the analysis instance.
- *
- * Technically, the analysis is facilitated by the following data structures:
- * - Call chain: for given verifier state, the call chain is a tuple of call
- * instruction indexes leading to the current subprogram plus the subprogram
- * entry point index.
- * - Function instance: for a given call chain, for each instruction in
- * the current subprogram, a mapping between instruction index and a
- * set of "may_read", "must_write" and other marks accumulated for this
- * instruction.
- * - A hash table mapping call chains to function instances.
- */
-
-struct callchain {
- u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */
- /* cached subprog_info[*].start for functions owning the frames:
- * - sp_starts[curframe] used to get insn relative index within current function;
- * - sp_starts[0..current-1] used for fast callchain_frame_up().
- */
- u32 sp_starts[MAX_CALL_FRAMES];
- u32 curframe; /* depth of callsites and sp_starts arrays */
-};
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
struct per_frame_masks {
- u64 may_read; /* stack slots that may be read by this instruction */
- u64 must_write; /* stack slots written by this instruction */
- u64 must_write_acc; /* stack slots written by this instruction and its successors */
- u64 live_before; /* stack slots that may be read by this insn and its successors */
+ spis_t may_read; /* stack slots that may be read by this instruction */
+ spis_t must_write; /* stack slots written by this instruction */
+ spis_t live_before; /* stack slots that may be read by this insn and its successors */
};
/*
- * A function instance created for a specific callchain.
+ * A function instance keyed by (callsite, depth).
* Encapsulates read and write marks for each instruction in the function.
- * Marks are tracked for each frame in the callchain.
+ * Marks are tracked for each frame up to @depth.
*/
struct func_instance {
struct hlist_node hl_node;
- struct callchain callchain;
+ u32 callsite; /* call insn that invoked this subprog (subprog_start for depth 0) */
+ u32 depth; /* call depth (0 = entry subprog) */
+ u32 subprog; /* subprog index */
+ u32 subprog_start; /* cached env->subprog_info[subprog].start */
u32 insn_cnt; /* cached number of insns in the function */
- bool updated;
- bool must_write_dropped;
/* Per frame, per instruction masks, frames allocated lazily. */
struct per_frame_masks *frames[MAX_CALL_FRAMES];
- /* For each instruction a flag telling if "must_write" had been initialized for it. */
- bool *must_write_set;
+ bool must_write_initialized;
};
struct live_stack_query {
struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */
+ u32 callsites[MAX_CALL_FRAMES]; /* callsite[i] = insn calling frame i+1 */
u32 curframe;
u32 insn_idx;
};
struct bpf_liveness {
- DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */
+ DECLARE_HASHTABLE(func_instances, 8); /* maps (depth, callsite) to func_instance */
struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */
- /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */
- struct func_instance *cur_instance;
- /*
- * Below fields are used to accumulate stack write marks for instruction at
- * @write_insn_idx before submitting the marks to @cur_instance.
- */
- u64 write_masks_acc[MAX_CALL_FRAMES];
- u32 write_insn_idx;
+ u32 subprog_calls; /* analyze_subprog() invocations */
};
-/* Compute callchain corresponding to state @st at depth @frameno */
-static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st,
- struct callchain *callchain, u32 frameno)
+/*
+ * Hash/compare key for func_instance: (depth, callsite).
+ * For depth == 0 (entry subprog), @callsite is the subprog start insn.
+ * For depth > 0, @callsite is the call instruction index that invoked the subprog.
+ */
+static u32 instance_hash(u32 callsite, u32 depth)
{
- struct bpf_subprog_info *subprog_info = env->subprog_info;
- u32 i;
+ u32 key[2] = { depth, callsite };
- memset(callchain, 0, sizeof(*callchain));
- for (i = 0; i <= frameno; i++) {
- callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start;
- if (i < st->curframe)
- callchain->callsites[i] = st->frame[i + 1]->callsite;
- }
- callchain->curframe = frameno;
- callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe];
-}
-
-static u32 hash_callchain(struct callchain *callchain)
-{
- return jhash2(callchain->callsites, callchain->curframe, 0);
+ return jhash2(key, 2, 0);
}
-static bool same_callsites(struct callchain *a, struct callchain *b)
+static struct func_instance *find_instance(struct bpf_verifier_env *env,
+ u32 callsite, u32 depth)
{
- int i;
+ struct bpf_liveness *liveness = env->liveness;
+ struct func_instance *f;
+ u32 key = instance_hash(callsite, depth);
- if (a->curframe != b->curframe)
- return false;
- for (i = a->curframe; i >= 0; i--)
- if (a->callsites[i] != b->callsites[i])
- return false;
- return true;
+ hash_for_each_possible(liveness->func_instances, f, hl_node, key)
+ if (f->depth == depth && f->callsite == callsite)
+ return f;
+ return NULL;
}
-/*
- * Find existing or allocate new function instance corresponding to @callchain.
- * Instances are accumulated in env->liveness->func_instances and persist
- * until the end of the verification process.
- */
-static struct func_instance *__lookup_instance(struct bpf_verifier_env *env,
- struct callchain *callchain)
+static struct func_instance *call_instance(struct bpf_verifier_env *env,
+ struct func_instance *caller,
+ u32 callsite, int subprog)
{
- struct bpf_liveness *liveness = env->liveness;
- struct bpf_subprog_info *subprog;
- struct func_instance *result;
- u32 subprog_sz, size, key;
-
- key = hash_callchain(callchain);
- hash_for_each_possible(liveness->func_instances, result, hl_node, key)
- if (same_callsites(&result->callchain, callchain))
- return result;
-
- subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]);
- subprog_sz = (subprog + 1)->start - subprog->start;
- size = sizeof(struct func_instance);
- result = kvzalloc(size, GFP_KERNEL_ACCOUNT);
- if (!result)
+ u32 depth = caller ? caller->depth + 1 : 0;
+ u32 subprog_start = env->subprog_info[subprog].start;
+ u32 lookup_key = depth > 0 ? callsite : subprog_start;
+ struct func_instance *f;
+ u32 hash;
+
+ f = find_instance(env, lookup_key, depth);
+ if (f)
+ return f;
+
+ f = kvzalloc(sizeof(*f), GFP_KERNEL_ACCOUNT);
+ if (!f)
return ERR_PTR(-ENOMEM);
- result->must_write_set = kvzalloc_objs(*result->must_write_set,
- subprog_sz, GFP_KERNEL_ACCOUNT);
- if (!result->must_write_set) {
- kvfree(result);
- return ERR_PTR(-ENOMEM);
- }
- memcpy(&result->callchain, callchain, sizeof(*callchain));
- result->insn_cnt = subprog_sz;
- hash_add(liveness->func_instances, &result->hl_node, key);
- return result;
+ f->callsite = lookup_key;
+ f->depth = depth;
+ f->subprog = subprog;
+ f->subprog_start = subprog_start;
+ f->insn_cnt = (env->subprog_info + subprog + 1)->start - subprog_start;
+ hash = instance_hash(lookup_key, depth);
+ hash_add(env->liveness->func_instances, &f->hl_node, hash);
+ return f;
}
static struct func_instance *lookup_instance(struct bpf_verifier_env *env,
struct bpf_verifier_state *st,
u32 frameno)
{
- struct callchain callchain;
-
- compute_callchain(env, st, &callchain, frameno);
- return __lookup_instance(env, &callchain);
+ u32 callsite, subprog_start;
+ struct func_instance *f;
+ u32 key, depth;
+
+ subprog_start = env->subprog_info[st->frame[frameno]->subprogno].start;
+ callsite = frameno > 0 ? st->frame[frameno]->callsite : subprog_start;
+
+ for (depth = frameno; ; depth--) {
+ key = depth > 0 ? callsite : subprog_start;
+ f = find_instance(env, key, depth);
+ if (f || depth == 0)
+ return f;
+ }
}
int bpf_stack_liveness_init(struct bpf_verifier_env *env)
@@ -233,9 +135,8 @@ void bpf_stack_liveness_free(struct bpf_verifier_env *env)
if (!env->liveness)
return;
hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) {
- for (i = 0; i <= instance->callchain.curframe; i++)
+ for (i = 0; i <= instance->depth; i++)
kvfree(instance->frames[i]);
- kvfree(instance->must_write_set);
kvfree(instance);
}
kvfree(env->liveness);
@@ -247,7 +148,7 @@ void bpf_stack_liveness_free(struct bpf_verifier_env *env)
*/
static int relative_idx(struct func_instance *instance, u32 insn_idx)
{
- return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe];
+ return insn_idx - instance->subprog_start;
}
static struct per_frame_masks *get_frame_masks(struct func_instance *instance,
@@ -259,8 +160,7 @@ static struct per_frame_masks *get_frame_masks(struct func_instance *instance,
return &instance->frames[frame][relative_idx(instance, insn_idx)];
}
-static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env,
- struct func_instance *instance,
+static struct per_frame_masks *alloc_frame_masks(struct func_instance *instance,
u32 frame, u32 insn_idx)
{
struct per_frame_masks *arr;
@@ -275,167 +175,29 @@ static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env,
return get_frame_masks(instance, frame, insn_idx);
}
-void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env)
-{
- env->liveness->cur_instance = NULL;
-}
-
-/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */
-static int ensure_cur_instance(struct bpf_verifier_env *env)
-{
- struct bpf_liveness *liveness = env->liveness;
- struct func_instance *instance;
-
- if (liveness->cur_instance)
- return 0;
-
- instance = lookup_instance(env, env->cur_state, env->cur_state->curframe);
- if (IS_ERR(instance))
- return PTR_ERR(instance);
-
- liveness->cur_instance = instance;
- return 0;
-}
-
/* Accumulate may_read masks for @frame at @insn_idx */
-static int mark_stack_read(struct bpf_verifier_env *env,
- struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask)
+static int mark_stack_read(struct func_instance *instance, u32 frame, u32 insn_idx, spis_t mask)
{
struct per_frame_masks *masks;
- u64 new_may_read;
- masks = alloc_frame_masks(env, instance, frame, insn_idx);
+ masks = alloc_frame_masks(instance, frame, insn_idx);
if (IS_ERR(masks))
return PTR_ERR(masks);
- new_may_read = masks->may_read | mask;
- if (new_may_read != masks->may_read &&
- ((new_may_read | masks->live_before) != masks->live_before))
- instance->updated = true;
- masks->may_read |= mask;
- return 0;
-}
-
-int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask)
-{
- int err;
-
- err = ensure_cur_instance(env);
- err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask);
- return err;
-}
-
-static void reset_stack_write_marks(struct bpf_verifier_env *env,
- struct func_instance *instance, u32 insn_idx)
-{
- struct bpf_liveness *liveness = env->liveness;
- int i;
-
- liveness->write_insn_idx = insn_idx;
- for (i = 0; i <= instance->callchain.curframe; i++)
- liveness->write_masks_acc[i] = 0;
-}
-
-int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx)
-{
- struct bpf_liveness *liveness = env->liveness;
- int err;
-
- err = ensure_cur_instance(env);
- if (err)
- return err;
-
- reset_stack_write_marks(env, liveness->cur_instance, insn_idx);
+ masks->may_read = spis_or(masks->may_read, mask);
return 0;
}
-void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask)
+static int mark_stack_write(struct func_instance *instance, u32 frame, u32 insn_idx, spis_t mask)
{
- env->liveness->write_masks_acc[frame] |= mask;
-}
-
-static int commit_stack_write_marks(struct bpf_verifier_env *env,
- struct func_instance *instance)
-{
- struct bpf_liveness *liveness = env->liveness;
- u32 idx, frame, curframe, old_must_write;
struct per_frame_masks *masks;
- u64 mask;
- if (!instance)
- return 0;
-
- curframe = instance->callchain.curframe;
- idx = relative_idx(instance, liveness->write_insn_idx);
- for (frame = 0; frame <= curframe; frame++) {
- mask = liveness->write_masks_acc[frame];
- /* avoid allocating frames for zero masks */
- if (mask == 0 && !instance->must_write_set[idx])
- continue;
- masks = alloc_frame_masks(env, instance, frame, liveness->write_insn_idx);
- if (IS_ERR(masks))
- return PTR_ERR(masks);
- old_must_write = masks->must_write;
- /*
- * If instruction at this callchain is seen for a first time, set must_write equal
- * to @mask. Otherwise take intersection with the previous value.
- */
- if (instance->must_write_set[idx])
- mask &= old_must_write;
- if (old_must_write != mask) {
- masks->must_write = mask;
- instance->updated = true;
- }
- if (old_must_write & ~mask)
- instance->must_write_dropped = true;
- }
- instance->must_write_set[idx] = true;
- liveness->write_insn_idx = 0;
+ masks = alloc_frame_masks(instance, frame, insn_idx);
+ if (IS_ERR(masks))
+ return PTR_ERR(masks);
+ masks->must_write = spis_or(masks->must_write, mask);
return 0;
}
-/*
- * Merge stack writes marks in @env->liveness->write_masks_acc
- * with information already in @env->liveness->cur_instance.
- */
-int bpf_commit_stack_write_marks(struct bpf_verifier_env *env)
-{
- return commit_stack_write_marks(env, env->liveness->cur_instance);
-}
-
-static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain)
-{
- char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf);
- char *buf = env->tmp_str_buf;
- int i;
-
- buf += snprintf(buf, buf_end - buf, "(");
- for (i = 0; i <= callchain->curframe; i++)
- buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]);
- snprintf(buf, buf_end - buf, ")");
- return env->tmp_str_buf;
-}
-
-static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain,
- char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new)
-{
- u64 changed_bits = old ^ new;
- u64 new_ones = new & changed_bits;
- u64 new_zeros = ~new & changed_bits;
-
- if (!changed_bits)
- return;
- bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx);
- if (new_ones) {
- bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones);
- bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf);
- }
- if (new_zeros) {
- bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros);
- bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf);
- }
- bpf_log(&env->log, "\n");
-}
-
int bpf_jmp_offset(struct bpf_insn *insn)
{
u8 code = insn->code;
@@ -507,62 +269,11 @@ bpf_insn_successors(struct bpf_verifier_env *env, u32 idx)
__diag_pop();
-static struct func_instance *get_outer_instance(struct bpf_verifier_env *env,
- struct func_instance *instance)
-{
- struct callchain callchain = instance->callchain;
-
- /* Adjust @callchain to represent callchain one frame up */
- callchain.callsites[callchain.curframe] = 0;
- callchain.sp_starts[callchain.curframe] = 0;
- callchain.curframe--;
- callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe];
- return __lookup_instance(env, &callchain);
-}
-
-static u32 callchain_subprog_start(struct callchain *callchain)
-{
- return callchain->sp_starts[callchain->curframe];
-}
-
-/*
- * Transfer @may_read and @must_write_acc marks from the first instruction of @instance,
- * to the call instruction in function instance calling @instance.
- */
-static int propagate_to_outer_instance(struct bpf_verifier_env *env,
- struct func_instance *instance)
-{
- struct callchain *callchain = &instance->callchain;
- u32 this_subprog_start, callsite, frame;
- struct func_instance *outer_instance;
- struct per_frame_masks *insn;
- int err;
-
- this_subprog_start = callchain_subprog_start(callchain);
- outer_instance = get_outer_instance(env, instance);
- if (IS_ERR(outer_instance))
- return PTR_ERR(outer_instance);
- callsite = callchain->callsites[callchain->curframe - 1];
-
- reset_stack_write_marks(env, outer_instance, callsite);
- for (frame = 0; frame < callchain->curframe; frame++) {
- insn = get_frame_masks(instance, frame, this_subprog_start);
- if (!insn)
- continue;
- bpf_mark_stack_write(env, frame, insn->must_write_acc);
- err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before);
- if (err)
- return err;
- }
- commit_stack_write_marks(env, outer_instance);
- return 0;
-}
static inline bool update_insn(struct bpf_verifier_env *env,
struct func_instance *instance, u32 frame, u32 insn_idx)
{
- struct bpf_insn_aux_data *aux = env->insn_aux_data;
- u64 new_before, new_after, must_write_acc;
+ spis_t new_before, new_after;
struct per_frame_masks *insn, *succ_insn;
struct bpf_iarray *succ;
u32 s;
@@ -574,77 +285,40 @@ static inline bool update_insn(struct bpf_verifier_env *env,
changed = false;
insn = get_frame_masks(instance, frame, insn_idx);
- new_before = 0;
- new_after = 0;
- /*
- * New "must_write_acc" is an intersection of all "must_write_acc"
- * of successors plus all "must_write" slots of instruction itself.
- */
- must_write_acc = U64_MAX;
+ new_before = SPIS_ZERO;
+ new_after = SPIS_ZERO;
for (s = 0; s < succ->cnt; ++s) {
succ_insn = get_frame_masks(instance, frame, succ->items[s]);
- new_after |= succ_insn->live_before;
- must_write_acc &= succ_insn->must_write_acc;
+ new_after = spis_or(new_after, succ_insn->live_before);
}
- must_write_acc |= insn->must_write;
/*
* New "live_before" is a union of all "live_before" of successors
* minus slots written by instruction plus slots read by instruction.
+ * new_before = (new_after & ~insn->must_write) | insn->may_read
*/
- new_before = (new_after & ~insn->must_write) | insn->may_read;
- changed |= new_before != insn->live_before;
- changed |= must_write_acc != insn->must_write_acc;
- if (unlikely(env->log.level & BPF_LOG_LEVEL2) &&
- (insn->may_read || insn->must_write ||
- insn_idx == callchain_subprog_start(&instance->callchain) ||
- aux[insn_idx].prune_point)) {
- log_mask_change(env, &instance->callchain, "live",
- frame, insn_idx, insn->live_before, new_before);
- log_mask_change(env, &instance->callchain, "written",
- frame, insn_idx, insn->must_write_acc, must_write_acc);
- }
+ new_before = spis_or(spis_and(new_after, spis_not(insn->must_write)),
+ insn->may_read);
+ changed |= !spis_equal(new_before, insn->live_before);
insn->live_before = new_before;
- insn->must_write_acc = must_write_acc;
return changed;
}
-/* Fixed-point computation of @live_before and @must_write_acc marks */
-static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+/* Fixed-point computation of @live_before marks */
+static void update_instance(struct bpf_verifier_env *env, struct func_instance *instance)
{
- u32 i, frame, po_start, po_end, cnt, this_subprog_start;
- struct callchain *callchain = &instance->callchain;
+ u32 i, frame, po_start, po_end;
int *insn_postorder = env->cfg.insn_postorder;
struct bpf_subprog_info *subprog;
- struct per_frame_masks *insn;
bool changed;
- int err;
-
- this_subprog_start = callchain_subprog_start(callchain);
- /*
- * If must_write marks were updated must_write_acc needs to be reset
- * (to account for the case when new must_write sets became smaller).
- */
- if (instance->must_write_dropped) {
- for (frame = 0; frame <= callchain->curframe; frame++) {
- if (!instance->frames[frame])
- continue;
-
- for (i = 0; i < instance->insn_cnt; i++) {
- insn = get_frame_masks(instance, frame, this_subprog_start + i);
- insn->must_write_acc = 0;
- }
- }
- }
- subprog = bpf_find_containing_subprog(env, this_subprog_start);
+ instance->must_write_initialized = true;
+ subprog = &env->subprog_info[instance->subprog];
po_start = subprog->postorder_start;
po_end = (subprog + 1)->postorder_start;
- cnt = 0;
/* repeat until fixed point is reached */
do {
- cnt++;
changed = false;
- for (frame = 0; frame <= instance->callchain.curframe; frame++) {
+ for (frame = 0; frame <= instance->depth; frame++) {
if (!instance->frames[frame])
continue;
@@ -652,57 +326,14 @@ static int update_instance(struct bpf_verifier_env *env, struct func_instance *i
changed |= update_insn(env, instance, frame, insn_postorder[i]);
}
} while (changed);
-
- if (env->log.level & BPF_LOG_LEVEL2)
- bpf_log(&env->log, "%s live stack update done in %d iterations\n",
- fmt_callchain(env, callchain), cnt);
-
- /* transfer marks accumulated for outer frames to outer func instance (caller) */
- if (callchain->curframe > 0) {
- err = propagate_to_outer_instance(env, instance);
- if (err)
- return err;
- }
-
- return 0;
-}
-
-/*
- * Prepare all callchains within @env->cur_state for querying.
- * This function should be called after each verifier.c:pop_stack()
- * and whenever verifier.c:do_check_insn() processes subprogram exit.
- * This would guarantee that visited verifier states with zero branches
- * have their bpf_mark_stack_{read,write}() effects propagated in
- * @env->liveness.
- */
-int bpf_update_live_stack(struct bpf_verifier_env *env)
-{
- struct func_instance *instance;
- int err, frame;
-
- bpf_reset_live_stack_callchain(env);
- for (frame = env->cur_state->curframe; frame >= 0; --frame) {
- instance = lookup_instance(env, env->cur_state, frame);
- if (IS_ERR(instance))
- return PTR_ERR(instance);
-
- if (instance->updated) {
- err = update_instance(env, instance);
- if (err)
- return err;
- instance->updated = false;
- instance->must_write_dropped = false;
- }
- }
- return 0;
}
-static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi)
+static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 half_spi)
{
struct per_frame_masks *masks;
masks = get_frame_masks(instance, frameno, insn_idx);
- return masks && (masks->live_before & BIT(spi));
+ return masks && spis_test_bit(masks->live_before, half_spi);
}
int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
@@ -714,41 +345,1858 @@ int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_
memset(q, 0, sizeof(*q));
for (frame = 0; frame <= st->curframe; frame++) {
instance = lookup_instance(env, st, frame);
- if (IS_ERR(instance))
- return PTR_ERR(instance);
- q->instances[frame] = instance;
+ if (IS_ERR_OR_NULL(instance))
+ q->instances[frame] = NULL;
+ else
+ q->instances[frame] = instance;
+ if (frame < st->curframe)
+ q->callsites[frame] = st->frame[frame + 1]->callsite;
}
q->curframe = st->curframe;
q->insn_idx = st->insn_idx;
return 0;
}
-bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi)
+bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 half_spi)
{
/*
- * Slot is alive if it is read before q->st->insn_idx in current func instance,
+ * Slot is alive if it is read before q->insn_idx in current func instance,
* or if for some outer func instance:
* - alive before callsite if callsite calls callback, otherwise
* - alive after callsite
*/
struct live_stack_query *q = &env->liveness->live_stack_query;
struct func_instance *instance, *curframe_instance;
- u32 i, callsite;
- bool alive;
+ u32 i, callsite, rel;
+ int cur_delta, delta;
+ bool alive = false;
curframe_instance = q->instances[q->curframe];
- if (is_live_before(curframe_instance, q->insn_idx, frameno, spi))
+ if (!curframe_instance)
+ return true;
+ cur_delta = (int)curframe_instance->depth - (int)q->curframe;
+ rel = frameno + cur_delta;
+ if (rel <= curframe_instance->depth)
+ alive = is_live_before(curframe_instance, q->insn_idx, rel, half_spi);
+
+ if (alive)
return true;
for (i = frameno; i < q->curframe; i++) {
- callsite = curframe_instance->callchain.callsites[i];
instance = q->instances[i];
+ if (!instance)
+ return true;
+ /* Map actual frameno to frame index within this instance */
+ delta = (int)instance->depth - (int)i;
+ rel = frameno + delta;
+ if (rel > instance->depth)
+ return true;
+
+ /* Get callsite from verifier state, not from instance callchain */
+ callsite = q->callsites[i];
+
alive = bpf_calls_callback(env, callsite)
- ? is_live_before(instance, callsite, frameno, spi)
- : is_live_before(instance, callsite + 1, frameno, spi);
+ ? is_live_before(instance, callsite, rel, half_spi)
+ : is_live_before(instance, callsite + 1, rel, half_spi);
if (alive)
return true;
}
return false;
}
+
+static char *fmt_subprog(struct bpf_verifier_env *env, int subprog)
+{
+ const char *name = env->subprog_info[subprog].name;
+
+ snprintf(env->tmp_str_buf, sizeof(env->tmp_str_buf),
+ "subprog#%d%s%s", subprog, name ? " " : "", name ? name : "");
+ return env->tmp_str_buf;
+}
+
+static char *fmt_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+{
+ snprintf(env->tmp_str_buf, sizeof(env->tmp_str_buf),
+ "(d%d,cs%d)", instance->depth, instance->callsite);
+ return env->tmp_str_buf;
+}
+
+static int spi_off(int spi)
+{
+ return -(spi + 1) * BPF_REG_SIZE;
+}
+
+/*
+ * When both halves of an 8-byte SPI are set, print as "-8","-16",...
+ * When only one half is set, print as "-4h","-8h",...
+ * Runs of 3+ consecutive fully-set SPIs are collapsed: "fp0-8..-24"
+ */
+static char *fmt_spis_mask(struct bpf_verifier_env *env, int frame, bool first, spis_t spis)
+{
+ int buf_sz = sizeof(env->tmp_str_buf);
+ char *buf = env->tmp_str_buf;
+ int spi, n, run_start;
+
+ buf[0] = '\0';
+
+ for (spi = 0; spi < STACK_SLOTS / 2 && buf_sz > 0; spi++) {
+ bool lo = spis_test_bit(spis, spi * 2);
+ bool hi = spis_test_bit(spis, spi * 2 + 1);
+ const char *space = first ? "" : " ";
+
+ if (!lo && !hi)
+ continue;
+
+ if (!lo || !hi) {
+ /* half-spi */
+ n = scnprintf(buf, buf_sz, "%sfp%d%d%s",
+ space, frame, spi_off(spi) + (lo ? STACK_SLOT_SZ : 0), "h");
+ } else if (spi + 2 < STACK_SLOTS / 2 &&
+ spis_test_bit(spis, spi * 2 + 2) &&
+ spis_test_bit(spis, spi * 2 + 3) &&
+ spis_test_bit(spis, spi * 2 + 4) &&
+ spis_test_bit(spis, spi * 2 + 5)) {
+ /* 3+ consecutive full spis */
+ run_start = spi;
+ while (spi + 1 < STACK_SLOTS / 2 &&
+ spis_test_bit(spis, (spi + 1) * 2) &&
+ spis_test_bit(spis, (spi + 1) * 2 + 1))
+ spi++;
+ n = scnprintf(buf, buf_sz, "%sfp%d%d..%d",
+ space, frame, spi_off(run_start), spi_off(spi));
+ } else {
+ /* just a full spi */
+ n = scnprintf(buf, buf_sz, "%sfp%d%d", space, frame, spi_off(spi));
+ }
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ }
+ return env->tmp_str_buf;
+}
+
+static void print_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+{
+ int start = env->subprog_info[instance->subprog].start;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct per_frame_masks *masks;
+ int len = instance->insn_cnt;
+ int insn_idx, frame, i;
+ bool has_use, has_def;
+ u64 pos, insn_pos;
+
+ if (!(env->log.level & BPF_LOG_LEVEL2))
+ return;
+
+ verbose(env, "stack use/def %s ", fmt_subprog(env, instance->subprog));
+ verbose(env, "%s:\n", fmt_instance(env, instance));
+ for (i = 0; i < len; i++) {
+ insn_idx = start + i;
+ has_use = false;
+ has_def = false;
+ pos = env->log.end_pos;
+ verbose(env, "%3d: ", insn_idx);
+ bpf_verbose_insn(env, &insns[insn_idx]);
+ bpf_vlog_reset(&env->log, env->log.end_pos - 1); /* remove \n */
+ insn_pos = env->log.end_pos;
+ verbose(env, "%*c;", bpf_vlog_alignment(insn_pos - pos), ' ');
+ pos = env->log.end_pos;
+ verbose(env, " use: ");
+ for (frame = instance->depth; frame >= 0; --frame) {
+ masks = get_frame_masks(instance, frame, insn_idx);
+ if (!masks || spis_is_zero(masks->may_read))
+ continue;
+ verbose(env, "%s", fmt_spis_mask(env, frame, !has_use, masks->may_read));
+ has_use = true;
+ }
+ if (!has_use)
+ bpf_vlog_reset(&env->log, pos);
+ pos = env->log.end_pos;
+ verbose(env, " def: ");
+ for (frame = instance->depth; frame >= 0; --frame) {
+ masks = get_frame_masks(instance, frame, insn_idx);
+ if (!masks || spis_is_zero(masks->must_write))
+ continue;
+ verbose(env, "%s", fmt_spis_mask(env, frame, !has_def, masks->must_write));
+ has_def = true;
+ }
+ if (!has_def)
+ bpf_vlog_reset(&env->log, has_use ? pos : insn_pos);
+ verbose(env, "\n");
+ if (bpf_is_ldimm64(&insns[insn_idx]))
+ i++;
+ }
+}
+
+static int cmp_instances(const void *pa, const void *pb)
+{
+ struct func_instance *a = *(struct func_instance **)pa;
+ struct func_instance *b = *(struct func_instance **)pb;
+ int dcallsite = (int)a->callsite - b->callsite;
+ int ddepth = (int)a->depth - b->depth;
+
+ if (dcallsite)
+ return dcallsite;
+ if (ddepth)
+ return ddepth;
+ return 0;
+}
+
+/* print use/def slots for all instances ordered by callsite first, then by depth */
+static int print_instances(struct bpf_verifier_env *env)
+{
+ struct func_instance *instance, **sorted_instances;
+ struct bpf_liveness *liveness = env->liveness;
+ int i, bkt, cnt;
+
+ cnt = 0;
+ hash_for_each(liveness->func_instances, bkt, instance, hl_node)
+ cnt++;
+ sorted_instances = kvmalloc_objs(*sorted_instances, cnt, GFP_KERNEL_ACCOUNT);
+ if (!sorted_instances)
+ return -ENOMEM;
+ cnt = 0;
+ hash_for_each(liveness->func_instances, bkt, instance, hl_node)
+ sorted_instances[cnt++] = instance;
+ sort(sorted_instances, cnt, sizeof(*sorted_instances), cmp_instances, NULL);
+ for (i = 0; i < cnt; i++)
+ print_instance(env, sorted_instances[i]);
+ kvfree(sorted_instances);
+ return 0;
+}
+
+/*
+ * Per-register tracking state for compute_subprog_args().
+ * Tracks which frame's FP a value is derived from
+ * and the byte offset from that frame's FP.
+ *
+ * The .frame field forms a lattice with three levels of precision:
+ *
+ * precise {frame=N, off=V} -- known absolute frame index and byte offset
+ * |
+ * offset-imprecise {frame=N, off=OFF_IMPRECISE}
+ * | -- known frame identity, unknown offset
+ * fully-imprecise {frame=ARG_IMPRECISE, mask=bitmask}
+ * -- unknown frame identity; .mask is a
+ * bitmask of which frame indices might be
+ * involved
+ *
+ * At CFG merge points, arg_track_join() moves down the lattice:
+ * - same frame + same offset -> precise
+ * - same frame + different offset -> offset-imprecise
+ * - different frames -> fully-imprecise (bitmask OR)
+ *
+ * At memory access sites (LDX/STX/ST), offset-imprecise marks only
+ * the known frame's access mask as SPIS_ALL, while fully-imprecise
+ * iterates bits in the bitmask and routes each frame to its target.
+ */
+#define MAX_ARG_OFFSETS 4
+
+struct arg_track {
+ union {
+ s16 off[MAX_ARG_OFFSETS]; /* byte offsets; off_cnt says how many */
+ u16 mask; /* arg bitmask when arg == ARG_IMPRECISE */
+ };
+ s8 frame; /* absolute frame index, or enum arg_track_state */
+ s8 off_cnt; /* 0 = offset-imprecise, 1-4 = # of precise offsets */
+};
+
+enum arg_track_state {
+ ARG_NONE = -1, /* not derived from any argument */
+ ARG_UNVISITED = -2, /* not yet reached by dataflow */
+ ARG_IMPRECISE = -3, /* lost identity; .mask is arg bitmask */
+};
+
+#define OFF_IMPRECISE S16_MIN /* arg identity known but offset unknown */
+
+/* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */
+#define MAX_ARG_SPILL_SLOTS 64
+
+static bool arg_is_visited(const struct arg_track *at)
+{
+ return at->frame != ARG_UNVISITED;
+}
+
+static bool arg_is_fp(const struct arg_track *at)
+{
+ return at->frame >= 0 || at->frame == ARG_IMPRECISE;
+}
+
+/*
+ * Clear all tracked callee stack slots overlapping the byte range
+ * [off, off+sz-1] where off is a negative FP-relative offset.
+ */
+static void clear_overlapping_stack_slots(struct arg_track *at_stack, s16 off, u32 sz)
+{
+ struct arg_track none = { .frame = ARG_NONE };
+
+ if (off == OFF_IMPRECISE) {
+ for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++)
+ at_stack[i] = none;
+ return;
+ }
+ for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++) {
+ int slot_start = -((i + 1) * 8);
+ int slot_end = slot_start + 8;
+
+ if (slot_start < off + (int)sz && slot_end > off)
+ at_stack[i] = none;
+ }
+}
+
+static void verbose_arg_track(struct bpf_verifier_env *env, struct arg_track *at)
+{
+ int i;
+
+ switch (at->frame) {
+ case ARG_NONE: verbose(env, "_"); break;
+ case ARG_UNVISITED: verbose(env, "?"); break;
+ case ARG_IMPRECISE: verbose(env, "IMP%x", at->mask); break;
+ default:
+ /* frame >= 0: absolute frame index */
+ if (at->off_cnt == 0) {
+ verbose(env, "fp%d ?", at->frame);
+ } else {
+ for (i = 0; i < at->off_cnt; i++) {
+ if (i)
+ verbose(env, "|");
+ verbose(env, "fp%d%+d", at->frame, at->off[i]);
+ }
+ }
+ break;
+ }
+}
+
+static bool arg_track_eq(const struct arg_track *a, const struct arg_track *b)
+{
+ int i;
+
+ if (a->frame != b->frame)
+ return false;
+ if (a->frame == ARG_IMPRECISE)
+ return a->mask == b->mask;
+ if (a->frame < 0)
+ return true;
+ if (a->off_cnt != b->off_cnt)
+ return false;
+ for (i = 0; i < a->off_cnt; i++)
+ if (a->off[i] != b->off[i])
+ return false;
+ return true;
+}
+
+static struct arg_track arg_single(s8 arg, s16 off)
+{
+ struct arg_track at = {};
+
+ at.frame = arg;
+ at.off[0] = off;
+ at.off_cnt = 1;
+ return at;
+}
+
+/*
+ * Merge two sorted offset arrays, deduplicate.
+ * Returns off_cnt=0 if the result exceeds MAX_ARG_OFFSETS.
+ * Both args must have the same frame and off_cnt > 0.
+ */
+static struct arg_track arg_merge_offsets(struct arg_track a, struct arg_track b)
+{
+ struct arg_track result = { .frame = a.frame };
+ struct arg_track imp = { .frame = a.frame };
+ int i = 0, j = 0, k = 0;
+
+ while (i < a.off_cnt && j < b.off_cnt) {
+ s16 v;
+
+ if (a.off[i] <= b.off[j]) {
+ v = a.off[i++];
+ if (v == b.off[j])
+ j++;
+ } else {
+ v = b.off[j++];
+ }
+ if (k > 0 && result.off[k - 1] == v)
+ continue;
+ if (k >= MAX_ARG_OFFSETS)
+ return imp;
+ result.off[k++] = v;
+ }
+ while (i < a.off_cnt) {
+ if (k >= MAX_ARG_OFFSETS)
+ return imp;
+ result.off[k++] = a.off[i++];
+ }
+ while (j < b.off_cnt) {
+ if (k >= MAX_ARG_OFFSETS)
+ return imp;
+ result.off[k++] = b.off[j++];
+ }
+ result.off_cnt = k;
+ return result;
+}
+
+/*
+ * Merge two arg_tracks into ARG_IMPRECISE, collecting the frame
+ * bits from both operands. Precise frame indices (frame >= 0)
+ * contribute a single bit; existing ARG_IMPRECISE values
+ * contribute their full bitmask.
+ */
+static struct arg_track arg_join_imprecise(struct arg_track a, struct arg_track b)
+{
+ u32 m = 0;
+
+ if (a.frame >= 0)
+ m |= BIT(a.frame);
+ else if (a.frame == ARG_IMPRECISE)
+ m |= a.mask;
+
+ if (b.frame >= 0)
+ m |= BIT(b.frame);
+ else if (b.frame == ARG_IMPRECISE)
+ m |= b.mask;
+
+ return (struct arg_track){ .mask = m, .frame = ARG_IMPRECISE };
+}
+
+/* Join two arg_track values at merge points */
+static struct arg_track __arg_track_join(struct arg_track a, struct arg_track b)
+{
+ if (!arg_is_visited(&b))
+ return a;
+ if (!arg_is_visited(&a))
+ return b;
+ if (a.frame == b.frame && a.frame >= 0) {
+ /* Both offset-imprecise: stay imprecise */
+ if (a.off_cnt == 0 || b.off_cnt == 0)
+ return (struct arg_track){ .frame = a.frame };
+ /* Merge offset sets; falls back to off_cnt=0 if >4 */
+ return arg_merge_offsets(a, b);
+ }
+
+ /*
+ * args are different, but one of them is known
+ * arg + none -> arg
+ * none + arg -> arg
+ *
+ * none + none -> none
+ */
+ if (a.frame == ARG_NONE && b.frame == ARG_NONE)
+ return a;
+ if (a.frame >= 0 && b.frame == ARG_NONE) {
+ /*
+ * When joining single fp-N add fake fp+0 to
+ * keep stack_use and prevent stack_def
+ */
+ if (a.off_cnt == 1)
+ return arg_merge_offsets(a, arg_single(a.frame, 0));
+ return a;
+ }
+ if (b.frame >= 0 && a.frame == ARG_NONE) {
+ if (b.off_cnt == 1)
+ return arg_merge_offsets(b, arg_single(b.frame, 0));
+ return b;
+ }
+
+ return arg_join_imprecise(a, b);
+}
+
+static bool arg_track_join(struct bpf_verifier_env *env, int idx, int target, int r,
+ struct arg_track *in, struct arg_track out)
+{
+ struct arg_track old = *in;
+ struct arg_track new_val = __arg_track_join(old, out);
+
+ if (arg_track_eq(&new_val, &old))
+ return false;
+
+ *in = new_val;
+ if (!(env->log.level & BPF_LOG_LEVEL2) || !arg_is_visited(&old))
+ return true;
+
+ verbose(env, "arg JOIN insn %d -> %d ", idx, target);
+ if (r >= 0)
+ verbose(env, "r%d: ", r);
+ else
+ verbose(env, "fp%+d: ", r * 8);
+ verbose_arg_track(env, &old);
+ verbose(env, " + ");
+ verbose_arg_track(env, &out);
+ verbose(env, " => ");
+ verbose_arg_track(env, &new_val);
+ verbose(env, "\n");
+ return true;
+}
+
+/*
+ * Compute the result when an ALU op destroys offset precision.
+ * If a single arg is identifiable, preserve it with OFF_IMPRECISE.
+ * If two different args are involved or one is already ARG_IMPRECISE,
+ * the result is fully ARG_IMPRECISE.
+ */
+static void arg_track_alu64(struct arg_track *dst, const struct arg_track *src)
+{
+ WARN_ON_ONCE(!arg_is_visited(dst));
+ WARN_ON_ONCE(!arg_is_visited(src));
+
+ if (dst->frame >= 0 && (src->frame == ARG_NONE || src->frame == dst->frame)) {
+ /*
+ * rX += rY where rY is not arg derived
+ * rX += rX
+ */
+ dst->off_cnt = 0;
+ return;
+ }
+ if (src->frame >= 0 && dst->frame == ARG_NONE) {
+ /*
+ * rX += rY where rX is not arg derived
+ * rY identity leaks into rX
+ */
+ dst->off_cnt = 0;
+ dst->frame = src->frame;
+ return;
+ }
+
+ if (dst->frame == ARG_NONE && src->frame == ARG_NONE)
+ return;
+
+ *dst = arg_join_imprecise(*dst, *src);
+}
+
+static s16 arg_add(s16 off, s64 delta)
+{
+ s64 res;
+
+ if (off == OFF_IMPRECISE)
+ return OFF_IMPRECISE;
+ res = (s64)off + delta;
+ if (res < S16_MIN + 1 || res > S16_MAX)
+ return OFF_IMPRECISE;
+ return res;
+}
+
+static void arg_padd(struct arg_track *at, s64 delta)
+{
+ int i;
+
+ if (at->off_cnt == 0)
+ return;
+ for (i = 0; i < at->off_cnt; i++) {
+ s16 new_off = arg_add(at->off[i], delta);
+
+ if (new_off == OFF_IMPRECISE) {
+ at->off_cnt = 0;
+ return;
+ }
+ at->off[i] = new_off;
+ }
+}
+
+/*
+ * Convert a byte offset from FP to a callee stack slot index.
+ * Returns -1 if out of range or not 8-byte aligned.
+ * Slot 0 = fp-8, slot 1 = fp-16, ..., slot 7 = fp-64, ....
+ */
+static int fp_off_to_slot(s16 off)
+{
+ if (off == OFF_IMPRECISE)
+ return -1;
+ if (off >= 0 || off < -(int)(MAX_ARG_SPILL_SLOTS * 8))
+ return -1;
+ if (off % 8)
+ return -1;
+ return (-off) / 8 - 1;
+}
+
+static struct arg_track fill_from_stack(struct bpf_insn *insn,
+ struct arg_track *at_out, int reg,
+ struct arg_track *at_stack_out,
+ int depth)
+{
+ struct arg_track imp = {
+ .mask = (1u << (depth + 1)) - 1,
+ .frame = ARG_IMPRECISE
+ };
+ struct arg_track result = { .frame = ARG_NONE };
+ int cnt, i;
+
+ if (reg == BPF_REG_FP) {
+ int slot = fp_off_to_slot(insn->off);
+
+ return slot >= 0 ? at_stack_out[slot] : imp;
+ }
+ cnt = at_out[reg].off_cnt;
+ if (cnt == 0)
+ return imp;
+
+ for (i = 0; i < cnt; i++) {
+ s16 fp_off = arg_add(at_out[reg].off[i], insn->off);
+ int slot = fp_off_to_slot(fp_off);
+
+ if (slot < 0)
+ return imp;
+ result = __arg_track_join(result, at_stack_out[slot]);
+ }
+ return result;
+}
+
+/*
+ * Spill @val to all possible stack slots indicated by the FP offsets in @reg.
+ * For an 8-byte store, single candidate slot gets @val. multi-slots are joined.
+ * sub-8-byte store joins with ARG_NONE.
+ * When exact offset is unknown conservatively add reg values to all slots in at_stack_out.
+ */
+static void spill_to_stack(struct bpf_insn *insn, struct arg_track *at_out,
+ int reg, struct arg_track *at_stack_out,
+ struct arg_track *val, u32 sz)
+{
+ struct arg_track none = { .frame = ARG_NONE };
+ struct arg_track new_val = sz == 8 ? *val : none;
+ int cnt, i;
+
+ if (reg == BPF_REG_FP) {
+ int slot = fp_off_to_slot(insn->off);
+
+ if (slot >= 0)
+ at_stack_out[slot] = new_val;
+ return;
+ }
+ cnt = at_out[reg].off_cnt;
+ if (cnt == 0) {
+ for (int slot = 0; slot < MAX_ARG_SPILL_SLOTS; slot++)
+ at_stack_out[slot] = __arg_track_join(at_stack_out[slot], new_val);
+ return;
+ }
+ for (i = 0; i < cnt; i++) {
+ s16 fp_off = arg_add(at_out[reg].off[i], insn->off);
+ int slot = fp_off_to_slot(fp_off);
+
+ if (slot < 0)
+ continue;
+ if (cnt == 1)
+ at_stack_out[slot] = new_val;
+ else
+ at_stack_out[slot] = __arg_track_join(at_stack_out[slot], new_val);
+ }
+}
+
+/*
+ * Clear stack slots overlapping all possible FP offsets in @reg.
+ */
+static void clear_stack_for_all_offs(struct bpf_insn *insn,
+ struct arg_track *at_out, int reg,
+ struct arg_track *at_stack_out, u32 sz)
+{
+ int cnt, i;
+
+ if (reg == BPF_REG_FP) {
+ clear_overlapping_stack_slots(at_stack_out, insn->off, sz);
+ return;
+ }
+ cnt = at_out[reg].off_cnt;
+ if (cnt == 0) {
+ clear_overlapping_stack_slots(at_stack_out, OFF_IMPRECISE, sz);
+ return;
+ }
+ for (i = 0; i < cnt; i++) {
+ s16 fp_off = arg_add(at_out[reg].off[i], insn->off);
+
+ clear_overlapping_stack_slots(at_stack_out, fp_off, sz);
+ }
+}
+
+static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, int idx,
+ struct arg_track *at_in, struct arg_track *at_stack_in,
+ struct arg_track *at_out, struct arg_track *at_stack_out)
+{
+ bool printed = false;
+ int i;
+
+ if (!(env->log.level & BPF_LOG_LEVEL2))
+ return;
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (arg_track_eq(&at_out[i], &at_in[i]))
+ continue;
+ if (!printed) {
+ verbose(env, "%3d: ", idx);
+ bpf_verbose_insn(env, insn);
+ bpf_vlog_reset(&env->log, env->log.end_pos - 1);
+ printed = true;
+ }
+ verbose(env, "\tr%d: ", i); verbose_arg_track(env, &at_in[i]);
+ verbose(env, " -> "); verbose_arg_track(env, &at_out[i]);
+ }
+ for (i = 0; i < MAX_ARG_SPILL_SLOTS; i++) {
+ if (arg_track_eq(&at_stack_out[i], &at_stack_in[i]))
+ continue;
+ if (!printed) {
+ verbose(env, "%3d: ", idx);
+ bpf_verbose_insn(env, insn);
+ bpf_vlog_reset(&env->log, env->log.end_pos - 1);
+ printed = true;
+ }
+ verbose(env, "\tfp%+d: ", -(i + 1) * 8); verbose_arg_track(env, &at_stack_in[i]);
+ verbose(env, " -> "); verbose_arg_track(env, &at_stack_out[i]);
+ }
+ if (printed)
+ verbose(env, "\n");
+}
+
+/*
+ * Pure dataflow transfer function for arg_track state.
+ * Updates at_out[] based on how the instruction modifies registers.
+ * Tracks spill/fill, but not other memory accesses.
+ */
+static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int insn_idx,
+ struct arg_track *at_out, struct arg_track *at_stack_out,
+ struct func_instance *instance,
+ u32 *callsites)
+{
+ int depth = instance->depth;
+ u8 class = BPF_CLASS(insn->code);
+ u8 code = BPF_OP(insn->code);
+ struct arg_track *dst = &at_out[insn->dst_reg];
+ struct arg_track *src = &at_out[insn->src_reg];
+ struct arg_track none = { .frame = ARG_NONE };
+ int r;
+
+ if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) {
+ if (code == BPF_MOV) {
+ *dst = none;
+ } else if (dst->frame >= 0) {
+ if (code == BPF_ADD)
+ arg_padd(dst, insn->imm);
+ else if (code == BPF_SUB)
+ arg_padd(dst, -(s64)insn->imm);
+ else
+ /* Any other 64-bit alu on the pointer makes it imprecise */
+ dst->off_cnt = 0;
+ } /* else if dst->frame is imprecise it stays so */
+ } else if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_X) {
+ if (code == BPF_MOV) {
+ if (insn->off == 0) {
+ *dst = *src;
+ } else {
+ /* addr_space_cast destroys a pointer */
+ *dst = none;
+ }
+ } else {
+ arg_track_alu64(dst, src);
+ }
+ } else if (class == BPF_ALU) {
+ /*
+ * 32-bit alu destroys the pointer.
+ * If src was a pointer it cannot leak into dst
+ */
+ *dst = none;
+ } else if (class == BPF_JMP && code == BPF_CALL) {
+ /*
+ * at_stack_out[slot] is not cleared by the helper and subprog calls.
+ * The fill_from_stack() may return the stale spill — which is an FP-derived arg_track
+ * (the value that was originally spilled there). The loaded register then carries
+ * a phantom FP-derived identity that doesn't correspond to what's actually in the slot.
+ * This phantom FP pointer propagates forward, and wherever it's subsequently used
+ * (as a helper argument, another store, etc.), it sets stack liveness bits.
+ * Those bits correspond to stack accesses that don't actually happen.
+ * So the effect is over-reporting stack liveness — marking slots as live that aren't
+ * actually accessed. The verifier preserves more state than necessary across calls,
+ * which is conservative.
+ *
+ * helpers can scratch stack slots, but they won't make a valid pointer out of it.
+ * subprogs are allowed to write into parent slots, but they cannot write
+ * _any_ FP-derived pointer into it (either their own or parent's FP).
+ */
+ for (r = BPF_REG_0; r <= BPF_REG_5; r++)
+ at_out[r] = none;
+ } else if (class == BPF_LDX) {
+ u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+ bool src_is_local_fp = insn->src_reg == BPF_REG_FP || src->frame == depth ||
+ (src->frame == ARG_IMPRECISE && (src->mask & BIT(depth)));
+
+ /*
+ * Reload from callee stack: if src is current-frame FP-derived
+ * and the load is an 8-byte BPF_MEM, try to restore the spill
+ * identity. For imprecise sources fill_from_stack() returns
+ * ARG_IMPRECISE (off_cnt == 0).
+ */
+ if (src_is_local_fp && BPF_MODE(insn->code) == BPF_MEM && sz == 8) {
+ *dst = fill_from_stack(insn, at_out, insn->src_reg, at_stack_out, depth);
+ } else if (src->frame >= 0 && src->frame < depth &&
+ BPF_MODE(insn->code) == BPF_MEM && sz == 8) {
+ struct arg_track *parent_stack =
+ env->callsite_at_stack[callsites[src->frame]];
+
+ *dst = fill_from_stack(insn, at_out, insn->src_reg,
+ parent_stack, src->frame);
+ } else if (src->frame == ARG_IMPRECISE &&
+ !(src->mask & BIT(depth)) && src->mask &&
+ BPF_MODE(insn->code) == BPF_MEM && sz == 8) {
+ /*
+ * Imprecise src with only parent-frame bits:
+ * conservative fallback.
+ */
+ *dst = *src;
+ } else {
+ *dst = none;
+ }
+ } else if (class == BPF_LD && BPF_MODE(insn->code) == BPF_IMM) {
+ *dst = none;
+ } else if (class == BPF_STX) {
+ u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+ bool dst_is_local_fp;
+
+ /* Track spills to current-frame FP-derived callee stack */
+ dst_is_local_fp = insn->dst_reg == BPF_REG_FP || dst->frame == depth;
+ if (dst_is_local_fp && BPF_MODE(insn->code) == BPF_MEM)
+ spill_to_stack(insn, at_out, insn->dst_reg,
+ at_stack_out, src, sz);
+
+ if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+ if (dst_is_local_fp && insn->imm != BPF_LOAD_ACQ)
+ clear_stack_for_all_offs(insn, at_out, insn->dst_reg,
+ at_stack_out, sz);
+
+ if (insn->imm == BPF_CMPXCHG)
+ at_out[BPF_REG_0] = none;
+ else if (insn->imm == BPF_LOAD_ACQ)
+ *dst = none;
+ else if (insn->imm & BPF_FETCH)
+ *src = none;
+ }
+ } else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) {
+ u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+ bool dst_is_local_fp = insn->dst_reg == BPF_REG_FP || dst->frame == depth;
+
+ /* BPF_ST to FP-derived dst: clear overlapping stack slots */
+ if (dst_is_local_fp)
+ clear_stack_for_all_offs(insn, at_out, insn->dst_reg,
+ at_stack_out, sz);
+ }
+}
+
+/*
+ * Record access_bytes from helper/kfunc or load/store insn.
+ * access_bytes > 0: stack read
+ * access_bytes < 0: stack write
+ * access_bytes == S64_MIN: unknown — conservative, mark [0..slot] as read
+ * access_bytes == 0: no access
+ *
+ */
+static int record_stack_access_off(struct func_instance *instance, s64 fp_off,
+ s64 access_bytes, u32 frame, u32 insn_idx)
+{
+ s32 slot_hi, slot_lo;
+ spis_t mask;
+
+ if (fp_off >= 0)
+ /*
+ * out of bounds stack access doesn't contribute
+ * into actual stack liveness. It will be rejected
+ * by the main verifier pass later.
+ */
+ return 0;
+ if (access_bytes == S64_MIN) {
+ /* helper/kfunc read unknown amount of bytes from fp_off until fp+0 */
+ slot_hi = (-fp_off - 1) / STACK_SLOT_SZ;
+ mask = SPIS_ZERO;
+ spis_or_range(&mask, 0, slot_hi);
+ return mark_stack_read(instance, frame, insn_idx, mask);
+ }
+ if (access_bytes > 0) {
+ /* Mark any touched slot as use */
+ slot_hi = (-fp_off - 1) / STACK_SLOT_SZ;
+ slot_lo = max_t(s32, (-fp_off - access_bytes) / STACK_SLOT_SZ, 0);
+ mask = SPIS_ZERO;
+ spis_or_range(&mask, slot_lo, slot_hi);
+ return mark_stack_read(instance, frame, insn_idx, mask);
+ } else if (access_bytes < 0) {
+ /* Mark only fully covered slots as def */
+ access_bytes = -access_bytes;
+ slot_hi = (-fp_off) / STACK_SLOT_SZ - 1;
+ slot_lo = max_t(s32, (-fp_off - access_bytes + STACK_SLOT_SZ - 1) / STACK_SLOT_SZ, 0);
+ if (slot_lo <= slot_hi) {
+ mask = SPIS_ZERO;
+ spis_or_range(&mask, slot_lo, slot_hi);
+ return mark_stack_write(instance, frame, insn_idx, mask);
+ }
+ }
+ return 0;
+}
+
+/*
+ * 'arg' is FP-derived argument to helper/kfunc or load/store that
+ * reads (positive) or writes (negative) 'access_bytes' into 'use' or 'def'.
+ */
+static int record_stack_access(struct func_instance *instance,
+ const struct arg_track *arg,
+ s64 access_bytes, u32 frame, u32 insn_idx)
+{
+ int i, err;
+
+ if (access_bytes == 0)
+ return 0;
+ if (arg->off_cnt == 0) {
+ if (access_bytes > 0 || access_bytes == S64_MIN)
+ return mark_stack_read(instance, frame, insn_idx, SPIS_ALL);
+ return 0;
+ }
+ if (access_bytes != S64_MIN && access_bytes < 0 && arg->off_cnt != 1)
+ /* multi-offset write cannot set stack_def */
+ return 0;
+
+ for (i = 0; i < arg->off_cnt; i++) {
+ err = record_stack_access_off(instance, arg->off[i], access_bytes, frame, insn_idx);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+/*
+ * When a pointer is ARG_IMPRECISE, conservatively mark every frame in
+ * the bitmask as fully used.
+ */
+static int record_imprecise(struct func_instance *instance, u32 mask, u32 insn_idx)
+{
+ int depth = instance->depth;
+ int f, err;
+
+ for (f = 0; mask; f++, mask >>= 1) {
+ if (!(mask & 1))
+ continue;
+ if (f <= depth) {
+ err = mark_stack_read(instance, f, insn_idx, SPIS_ALL);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
+
+/* Record load/store access for a given 'at' state of 'insn'. */
+static int record_load_store_access(struct bpf_verifier_env *env,
+ struct func_instance *instance,
+ struct arg_track *at, int insn_idx)
+{
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ int depth = instance->depth;
+ s32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+ u8 class = BPF_CLASS(insn->code);
+ struct arg_track resolved, *ptr;
+ int oi;
+
+ switch (class) {
+ case BPF_LDX:
+ ptr = &at[insn->src_reg];
+ break;
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+ if (insn->imm == BPF_STORE_REL)
+ sz = -sz;
+ if (insn->imm == BPF_LOAD_ACQ)
+ ptr = &at[insn->src_reg];
+ else
+ ptr = &at[insn->dst_reg];
+ } else {
+ ptr = &at[insn->dst_reg];
+ sz = -sz;
+ }
+ break;
+ case BPF_ST:
+ ptr = &at[insn->dst_reg];
+ sz = -sz;
+ break;
+ default:
+ return 0;
+ }
+
+ /* Resolve offsets: fold insn->off into arg_track */
+ if (ptr->off_cnt > 0) {
+ resolved.off_cnt = ptr->off_cnt;
+ resolved.frame = ptr->frame;
+ for (oi = 0; oi < ptr->off_cnt; oi++) {
+ resolved.off[oi] = arg_add(ptr->off[oi], insn->off);
+ if (resolved.off[oi] == OFF_IMPRECISE) {
+ resolved.off_cnt = 0;
+ break;
+ }
+ }
+ ptr = &resolved;
+ }
+
+ if (ptr->frame >= 0 && ptr->frame <= depth)
+ return record_stack_access(instance, ptr, sz, ptr->frame, insn_idx);
+ if (ptr->frame == ARG_IMPRECISE)
+ return record_imprecise(instance, ptr->mask, insn_idx);
+ /* ARG_NONE: not derived from any frame pointer, skip */
+ return 0;
+}
+
+/* Record stack access for a given 'at' state of helper/kfunc 'insn' */
+static int record_call_access(struct bpf_verifier_env *env,
+ struct func_instance *instance,
+ struct arg_track *at,
+ int insn_idx)
+{
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ int depth = instance->depth;
+ struct bpf_call_summary cs;
+ int r, err = 0, num_params = 5;
+
+ if (bpf_pseudo_call(insn))
+ return 0;
+
+ if (bpf_get_call_summary(env, insn, &cs))
+ num_params = cs.num_params;
+
+ for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) {
+ int frame = at[r].frame;
+ s64 bytes;
+
+ if (!arg_is_fp(&at[r]))
+ continue;
+
+ if (bpf_helper_call(insn)) {
+ bytes = bpf_helper_stack_access_bytes(env, insn, r - 1, insn_idx);
+ } else if (bpf_pseudo_kfunc_call(insn)) {
+ bytes = bpf_kfunc_stack_access_bytes(env, insn, r - 1, insn_idx);
+ } else {
+ for (int f = 0; f <= depth; f++) {
+ err = mark_stack_read(instance, f, insn_idx, SPIS_ALL);
+ if (err)
+ return err;
+ }
+ return 0;
+ }
+ if (bytes == 0)
+ continue;
+
+ if (frame >= 0 && frame <= depth)
+ err = record_stack_access(instance, &at[r], bytes, frame, insn_idx);
+ else if (frame == ARG_IMPRECISE)
+ err = record_imprecise(instance, at[r].mask, insn_idx);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+/*
+ * For a calls_callback helper, find the callback subprog and determine
+ * which caller register maps to which callback register for FP passthrough.
+ */
+static int find_callback_subprog(struct bpf_verifier_env *env,
+ struct bpf_insn *insn, int insn_idx,
+ int *caller_reg, int *callee_reg)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+ int cb_reg = -1;
+
+ *caller_reg = -1;
+ *callee_reg = -1;
+
+ if (!bpf_helper_call(insn))
+ return -1;
+ switch (insn->imm) {
+ case BPF_FUNC_loop:
+ /* bpf_loop(nr, cb, ctx, flags): cb=R2, R3->cb R2 */
+ cb_reg = BPF_REG_2;
+ *caller_reg = BPF_REG_3;
+ *callee_reg = BPF_REG_2;
+ break;
+ case BPF_FUNC_for_each_map_elem:
+ /* for_each_map_elem(map, cb, ctx, flags): cb=R2, R3->cb R4 */
+ cb_reg = BPF_REG_2;
+ *caller_reg = BPF_REG_3;
+ *callee_reg = BPF_REG_4;
+ break;
+ case BPF_FUNC_find_vma:
+ /* find_vma(task, addr, cb, ctx, flags): cb=R3, R4->cb R3 */
+ cb_reg = BPF_REG_3;
+ *caller_reg = BPF_REG_4;
+ *callee_reg = BPF_REG_3;
+ break;
+ case BPF_FUNC_user_ringbuf_drain:
+ /* user_ringbuf_drain(map, cb, ctx, flags): cb=R2, R3->cb R2 */
+ cb_reg = BPF_REG_2;
+ *caller_reg = BPF_REG_3;
+ *callee_reg = BPF_REG_2;
+ break;
+ default:
+ return -1;
+ }
+
+ if (!(aux->const_reg_subprog_mask & BIT(cb_reg)))
+ return -2;
+
+ return aux->const_reg_vals[cb_reg];
+}
+
+/* Per-subprog intermediate state kept alive across analysis phases */
+struct subprog_at_info {
+ struct arg_track (*at_in)[MAX_BPF_REG];
+ int len;
+};
+
+static void print_subprog_arg_access(struct bpf_verifier_env *env,
+ int subprog,
+ struct subprog_at_info *info,
+ struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS])
+{
+ struct bpf_insn *insns = env->prog->insnsi;
+ int start = env->subprog_info[subprog].start;
+ int len = info->len;
+ int i, r;
+
+ if (!(env->log.level & BPF_LOG_LEVEL2))
+ return;
+
+ verbose(env, "%s:\n", fmt_subprog(env, subprog));
+ for (i = 0; i < len; i++) {
+ int idx = start + i;
+ bool has_extra = false;
+ u8 cls = BPF_CLASS(insns[idx].code);
+ bool is_ldx_stx_call = cls == BPF_LDX || cls == BPF_STX ||
+ insns[idx].code == (BPF_JMP | BPF_CALL);
+
+ verbose(env, "%3d: ", idx);
+ bpf_verbose_insn(env, &insns[idx]);
+
+ /* Collect what needs printing */
+ if (is_ldx_stx_call &&
+ arg_is_visited(&info->at_in[i][0])) {
+ for (r = 0; r < MAX_BPF_REG - 1; r++)
+ if (arg_is_fp(&info->at_in[i][r]))
+ has_extra = true;
+ }
+ if (is_ldx_stx_call) {
+ for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+ if (arg_is_fp(&at_stack_in[i][r]))
+ has_extra = true;
+ }
+
+ if (!has_extra) {
+ if (bpf_is_ldimm64(&insns[idx]))
+ i++;
+ continue;
+ }
+
+ bpf_vlog_reset(&env->log, env->log.end_pos - 1);
+ verbose(env, " //");
+
+ if (is_ldx_stx_call && info->at_in &&
+ arg_is_visited(&info->at_in[i][0])) {
+ for (r = 0; r < MAX_BPF_REG - 1; r++) {
+ if (!arg_is_fp(&info->at_in[i][r]))
+ continue;
+ verbose(env, " r%d=", r);
+ verbose_arg_track(env, &info->at_in[i][r]);
+ }
+ }
+
+ if (is_ldx_stx_call) {
+ for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) {
+ if (!arg_is_fp(&at_stack_in[i][r]))
+ continue;
+ verbose(env, " fp%+d=", -(r + 1) * 8);
+ verbose_arg_track(env, &at_stack_in[i][r]);
+ }
+ }
+
+ verbose(env, "\n");
+ if (bpf_is_ldimm64(&insns[idx]))
+ i++;
+ }
+}
+
+/*
+ * Compute arg tracking dataflow for a single subprog.
+ * Runs forward fixed-point with arg_track_xfer(), then records
+ * memory accesses in a single linear pass over converged state.
+ *
+ * @callee_entry: pre-populated entry state for R1-R5
+ * NULL for main (subprog 0).
+ * @info: stores at_in, len for debug printing.
+ */
+static int compute_subprog_args(struct bpf_verifier_env *env,
+ struct subprog_at_info *info,
+ struct arg_track *callee_entry,
+ struct func_instance *instance,
+ u32 *callsites)
+{
+ int subprog = instance->subprog;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int depth = instance->depth;
+ int start = env->subprog_info[subprog].start;
+ int po_start = env->subprog_info[subprog].postorder_start;
+ int end = env->subprog_info[subprog + 1].start;
+ int po_end = env->subprog_info[subprog + 1].postorder_start;
+ int len = end - start;
+ struct arg_track (*at_in)[MAX_BPF_REG] = NULL;
+ struct arg_track at_out[MAX_BPF_REG];
+ struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS] = NULL;
+ struct arg_track *at_stack_out = NULL;
+ struct arg_track unvisited = { .frame = ARG_UNVISITED };
+ struct arg_track none = { .frame = ARG_NONE };
+ bool changed;
+ int i, p, r, err = -ENOMEM;
+
+ at_in = kvmalloc_objs(*at_in, len, GFP_KERNEL_ACCOUNT);
+ if (!at_in)
+ goto err_free;
+
+ at_stack_in = kvmalloc_objs(*at_stack_in, len, GFP_KERNEL_ACCOUNT);
+ if (!at_stack_in)
+ goto err_free;
+
+ at_stack_out = kvmalloc_objs(*at_stack_out, MAX_ARG_SPILL_SLOTS, GFP_KERNEL_ACCOUNT);
+ if (!at_stack_out)
+ goto err_free;
+
+ for (i = 0; i < len; i++) {
+ for (r = 0; r < MAX_BPF_REG; r++)
+ at_in[i][r] = unvisited;
+ for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+ at_stack_in[i][r] = unvisited;
+ }
+
+ for (r = 0; r < MAX_BPF_REG; r++)
+ at_in[0][r] = none;
+
+ /* Entry: R10 is always precisely the current frame's FP */
+ at_in[0][BPF_REG_FP] = arg_single(depth, 0);
+
+ /* R1-R5: from caller or ARG_NONE for main */
+ if (callee_entry) {
+ for (r = BPF_REG_1; r <= BPF_REG_5; r++)
+ at_in[0][r] = callee_entry[r];
+ }
+
+ /* Entry: all stack slots are ARG_NONE */
+ for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+ at_stack_in[0][r] = none;
+
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "subprog#%d: analyzing (depth %d)...\n", subprog, depth);
+
+ /* Forward fixed-point iteration in reverse post order */
+redo:
+ changed = false;
+ for (p = po_end - 1; p >= po_start; p--) {
+ int idx = env->cfg.insn_postorder[p];
+ int i = idx - start;
+ struct bpf_insn *insn = &insns[idx];
+ struct bpf_iarray *succ;
+
+ if (!arg_is_visited(&at_in[i][0]) && !arg_is_visited(&at_in[i][1]))
+ continue;
+
+ memcpy(at_out, at_in[i], sizeof(at_out));
+ memcpy(at_stack_out, at_stack_in[i], MAX_ARG_SPILL_SLOTS * sizeof(*at_stack_out));
+
+ arg_track_xfer(env, insn, idx, at_out, at_stack_out, instance, callsites);
+ arg_track_log(env, insn, idx, at_in[i], at_stack_in[i], at_out, at_stack_out);
+
+ /* Propagate to successors within this subprogram */
+ succ = bpf_insn_successors(env, idx);
+ for (int s = 0; s < succ->cnt; s++) {
+ int target = succ->items[s];
+ int ti;
+
+ /* Filter: stay within the subprogram's range */
+ if (target < start || target >= end)
+ continue;
+ ti = target - start;
+
+ for (r = 0; r < MAX_BPF_REG; r++)
+ changed |= arg_track_join(env, idx, target, r,
+ &at_in[ti][r], at_out[r]);
+
+ for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+ changed |= arg_track_join(env, idx, target, -r - 1,
+ &at_stack_in[ti][r], at_stack_out[r]);
+ }
+ }
+ if (changed)
+ goto redo;
+
+ /* Record memory accesses using converged at_in (RPO skips dead code) */
+ for (p = po_end - 1; p >= po_start; p--) {
+ int idx = env->cfg.insn_postorder[p];
+ int i = idx - start;
+ struct bpf_insn *insn = &insns[idx];
+
+ err = record_load_store_access(env, instance, at_in[i], idx);
+ if (err)
+ goto err_free;
+
+ if (insn->code == (BPF_JMP | BPF_CALL)) {
+ err = record_call_access(env, instance, at_in[i], idx);
+ if (err)
+ goto err_free;
+ }
+
+ if (bpf_pseudo_call(insn) || bpf_calls_callback(env, idx)) {
+ kvfree(env->callsite_at_stack[idx]);
+ env->callsite_at_stack[idx] =
+ kvmalloc_objs(*env->callsite_at_stack[idx],
+ MAX_ARG_SPILL_SLOTS, GFP_KERNEL_ACCOUNT);
+ if (!env->callsite_at_stack[idx]) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+ memcpy(env->callsite_at_stack[idx],
+ at_stack_in[i], sizeof(struct arg_track) * MAX_ARG_SPILL_SLOTS);
+ }
+ }
+
+ info->at_in = at_in;
+ at_in = NULL;
+ info->len = len;
+ print_subprog_arg_access(env, subprog, info, at_stack_in);
+ err = 0;
+
+err_free:
+ kvfree(at_stack_out);
+ kvfree(at_stack_in);
+ kvfree(at_in);
+ return err;
+}
+
+/* Return true if any of R1-R5 is derived from a frame pointer. */
+static bool has_fp_args(struct arg_track *args)
+{
+ for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
+ if (args[r].frame != ARG_NONE)
+ return true;
+ return false;
+}
+
+/*
+ * Merge a freshly analyzed instance into the original.
+ * may_read: union (any pass might read the slot).
+ * must_write: intersection (only slots written on ALL passes are guaranteed).
+ * live_before is recomputed by a subsequent update_instance() on @dst.
+ */
+static void merge_instances(struct func_instance *dst, struct func_instance *src)
+{
+ int f, i;
+
+ for (f = 0; f <= dst->depth; f++) {
+ if (!src->frames[f]) {
+ /* This pass didn't touch frame f — must_write intersects with empty. */
+ if (dst->frames[f])
+ for (i = 0; i < dst->insn_cnt; i++)
+ dst->frames[f][i].must_write = SPIS_ZERO;
+ continue;
+ }
+ if (!dst->frames[f]) {
+ /* Previous pass didn't touch frame f — take src, zero must_write. */
+ dst->frames[f] = src->frames[f];
+ src->frames[f] = NULL;
+ for (i = 0; i < dst->insn_cnt; i++)
+ dst->frames[f][i].must_write = SPIS_ZERO;
+ continue;
+ }
+ for (i = 0; i < dst->insn_cnt; i++) {
+ dst->frames[f][i].may_read =
+ spis_or(dst->frames[f][i].may_read,
+ src->frames[f][i].may_read);
+ dst->frames[f][i].must_write =
+ spis_and(dst->frames[f][i].must_write,
+ src->frames[f][i].must_write);
+ }
+ }
+}
+
+static struct func_instance *fresh_instance(struct func_instance *src)
+{
+ struct func_instance *f;
+
+ f = kvzalloc_obj(*f, GFP_KERNEL_ACCOUNT);
+ if (!f)
+ return ERR_PTR(-ENOMEM);
+ f->callsite = src->callsite;
+ f->depth = src->depth;
+ f->subprog = src->subprog;
+ f->subprog_start = src->subprog_start;
+ f->insn_cnt = src->insn_cnt;
+ return f;
+}
+
+static void free_instance(struct func_instance *instance)
+{
+ int i;
+
+ for (i = 0; i <= instance->depth; i++)
+ kvfree(instance->frames[i]);
+ kvfree(instance);
+}
+
+/*
+ * Recursively analyze a subprog with specific 'entry_args'.
+ * Each callee is analyzed with the exact args from its call site.
+ *
+ * Args are recomputed for each call because the dataflow result at_in[]
+ * depends on the entry args and frame depth. Consider: A->C->D and B->C->D
+ * Callsites in A and B pass different args into C, so C is recomputed.
+ * Then within C the same callsite passes different args into D.
+ */
+static int analyze_subprog(struct bpf_verifier_env *env,
+ struct arg_track *entry_args,
+ struct subprog_at_info *info,
+ struct func_instance *instance,
+ u32 *callsites)
+{
+ int subprog = instance->subprog;
+ int depth = instance->depth;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int start = env->subprog_info[subprog].start;
+ int po_start = env->subprog_info[subprog].postorder_start;
+ int po_end = env->subprog_info[subprog + 1].postorder_start;
+ struct func_instance *prev_instance = NULL;
+ int j, err;
+
+ if (++env->liveness->subprog_calls > 10000) {
+ verbose(env, "liveness analysis exceeded complexity limit (%d calls)\n",
+ env->liveness->subprog_calls);
+ return -E2BIG;
+ }
+
+ if (need_resched())
+ cond_resched();
+
+
+ /*
+ * When an instance is reused (must_write_initialized == true),
+ * record into a fresh instance and merge afterward. This avoids
+ * stale must_write marks for instructions not reached in this pass.
+ */
+ if (instance->must_write_initialized) {
+ struct func_instance *fresh = fresh_instance(instance);
+
+ if (IS_ERR(fresh))
+ return PTR_ERR(fresh);
+ prev_instance = instance;
+ instance = fresh;
+ }
+
+ /* Free prior analysis if this subprog was already visited */
+ kvfree(info[subprog].at_in);
+ info[subprog].at_in = NULL;
+
+ err = compute_subprog_args(env, &info[subprog], entry_args, instance, callsites);
+ if (err)
+ goto out_free;
+
+ /* For each reachable call site in the subprog, recurse into callees */
+ for (int p = po_start; p < po_end; p++) {
+ int idx = env->cfg.insn_postorder[p];
+ struct arg_track callee_args[BPF_REG_5 + 1];
+ struct arg_track none = { .frame = ARG_NONE };
+ struct bpf_insn *insn = &insns[idx];
+ struct func_instance *callee_instance;
+ int callee, target;
+ int caller_reg, cb_callee_reg;
+
+ j = idx - start; /* relative index within this subprog */
+
+ if (bpf_pseudo_call(insn)) {
+ target = idx + insn->imm + 1;
+ callee = bpf_find_subprog(env, target);
+ if (callee < 0)
+ continue;
+
+ /* Build entry args: R1-R5 from at_in at call site */
+ for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
+ callee_args[r] = info[subprog].at_in[j][r];
+ } else if (bpf_calls_callback(env, idx)) {
+ callee = find_callback_subprog(env, insn, idx, &caller_reg, &cb_callee_reg);
+ if (callee == -2) {
+ /*
+ * same bpf_loop() calls two different callbacks and passes
+ * stack pointer to them
+ */
+ if (info[subprog].at_in[j][caller_reg].frame == ARG_NONE)
+ continue;
+ for (int f = 0; f <= depth; f++) {
+ err = mark_stack_read(instance, f, idx, SPIS_ALL);
+ if (err)
+ goto out_free;
+ }
+ continue;
+ }
+ if (callee < 0)
+ continue;
+
+ for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
+ callee_args[r] = none;
+ callee_args[cb_callee_reg] = info[subprog].at_in[j][caller_reg];
+ } else {
+ continue;
+ }
+
+ if (!has_fp_args(callee_args))
+ continue;
+
+ if (depth == MAX_CALL_FRAMES - 1) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ callee_instance = call_instance(env, instance, idx, callee);
+ if (IS_ERR(callee_instance)) {
+ err = PTR_ERR(callee_instance);
+ goto out_free;
+ }
+ callsites[depth] = idx;
+ err = analyze_subprog(env, callee_args, info, callee_instance, callsites);
+ if (err)
+ goto out_free;
+
+ /* Pull callee's entry liveness back to caller's callsite */
+ {
+ u32 callee_start = callee_instance->subprog_start;
+ struct per_frame_masks *entry;
+
+ for (int f = 0; f < callee_instance->depth; f++) {
+ entry = get_frame_masks(callee_instance, f, callee_start);
+ if (!entry)
+ continue;
+ err = mark_stack_read(instance, f, idx, entry->live_before);
+ if (err)
+ goto out_free;
+ }
+ }
+ }
+
+ if (prev_instance) {
+ merge_instances(prev_instance, instance);
+ free_instance(instance);
+ instance = prev_instance;
+ }
+ update_instance(env, instance);
+ return 0;
+
+out_free:
+ if (prev_instance)
+ free_instance(instance);
+ return err;
+}
+
+int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env)
+{
+ u32 callsites[MAX_CALL_FRAMES] = {};
+ int insn_cnt = env->prog->len;
+ struct func_instance *instance;
+ struct subprog_at_info *info;
+ int k, err = 0;
+
+ info = kvzalloc_objs(*info, env->subprog_cnt, GFP_KERNEL_ACCOUNT);
+ if (!info)
+ return -ENOMEM;
+
+ env->callsite_at_stack = kvzalloc_objs(*env->callsite_at_stack, insn_cnt,
+ GFP_KERNEL_ACCOUNT);
+ if (!env->callsite_at_stack) {
+ kvfree(info);
+ return -ENOMEM;
+ }
+
+ instance = call_instance(env, NULL, 0, 0);
+ if (IS_ERR(instance)) {
+ err = PTR_ERR(instance);
+ goto out;
+ }
+ err = analyze_subprog(env, NULL, info, instance, callsites);
+ if (err)
+ goto out;
+
+ /*
+ * Subprogs and callbacks that don't receive FP-derived arguments
+ * cannot access ancestor stack frames, so they were skipped during
+ * the recursive walk above. Async callbacks (timer, workqueue) are
+ * also not reachable from the main program's call graph. Analyze
+ * all unvisited subprogs as independent roots at depth 0.
+ *
+ * Use reverse topological order (callers before callees) so that
+ * each subprog is analyzed before its callees, allowing the
+ * recursive walk inside analyze_subprog() to naturally
+ * reach nested callees that also lack FP-derived args.
+ */
+ for (k = env->subprog_cnt - 1; k >= 0; k--) {
+ int sub = env->subprog_topo_order[k];
+
+ if (info[sub].at_in && !bpf_subprog_is_global(env, sub))
+ continue;
+ instance = call_instance(env, NULL, 0, sub);
+ if (IS_ERR(instance)) {
+ err = PTR_ERR(instance);
+ goto out;
+ }
+ err = analyze_subprog(env, NULL, info, instance, callsites);
+ if (err)
+ goto out;
+ }
+
+ if (env->log.level & BPF_LOG_LEVEL2)
+ err = print_instances(env);
+
+out:
+ for (k = 0; k < insn_cnt; k++)
+ kvfree(env->callsite_at_stack[k]);
+ kvfree(env->callsite_at_stack);
+ env->callsite_at_stack = NULL;
+ for (k = 0; k < env->subprog_cnt; k++)
+ kvfree(info[k].at_in);
+ kvfree(info);
+ return err;
+}
+
+/* Each field is a register bitmask */
+struct insn_live_regs {
+ u16 use; /* registers read by instruction */
+ u16 def; /* registers written by instruction */
+ u16 in; /* registers that may be alive before instruction */
+ u16 out; /* registers that may be alive after instruction */
+};
+
+/* Bitmask with 1s for all caller saved registers */
+#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+
+/* Compute info->{use,def} fields for the instruction */
+static void compute_insn_live_regs(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct insn_live_regs *info)
+{
+ struct bpf_call_summary cs;
+ u8 class = BPF_CLASS(insn->code);
+ u8 code = BPF_OP(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u16 src = BIT(insn->src_reg);
+ u16 dst = BIT(insn->dst_reg);
+ u16 r0 = BIT(0);
+ u16 def = 0;
+ u16 use = 0xffff;
+
+ switch (class) {
+ case BPF_LD:
+ switch (mode) {
+ case BPF_IMM:
+ if (BPF_SIZE(insn->code) == BPF_DW) {
+ def = dst;
+ use = 0;
+ }
+ break;
+ case BPF_LD | BPF_ABS:
+ case BPF_LD | BPF_IND:
+ /* stick with defaults */
+ break;
+ }
+ break;
+ case BPF_LDX:
+ switch (mode) {
+ case BPF_MEM:
+ case BPF_MEMSX:
+ def = dst;
+ use = src;
+ break;
+ }
+ break;
+ case BPF_ST:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst;
+ break;
+ }
+ break;
+ case BPF_STX:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst | src;
+ break;
+ case BPF_ATOMIC:
+ switch (insn->imm) {
+ case BPF_CMPXCHG:
+ use = r0 | dst | src;
+ def = r0;
+ break;
+ case BPF_LOAD_ACQ:
+ def = dst;
+ use = src;
+ break;
+ case BPF_STORE_REL:
+ def = 0;
+ use = dst | src;
+ break;
+ default:
+ use = dst | src;
+ if (insn->imm & BPF_FETCH)
+ def = src;
+ else
+ def = 0;
+ }
+ break;
+ }
+ break;
+ case BPF_ALU:
+ case BPF_ALU64:
+ switch (code) {
+ case BPF_END:
+ use = dst;
+ def = dst;
+ break;
+ case BPF_MOV:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = 0;
+ else
+ use = src;
+ break;
+ default:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ case BPF_JMP:
+ case BPF_JMP32:
+ switch (code) {
+ case BPF_JA:
+ def = 0;
+ if (BPF_SRC(insn->code) == BPF_X)
+ use = dst;
+ else
+ use = 0;
+ break;
+ case BPF_JCOND:
+ def = 0;
+ use = 0;
+ break;
+ case BPF_EXIT:
+ def = 0;
+ use = r0;
+ break;
+ case BPF_CALL:
+ def = ALL_CALLER_SAVED_REGS;
+ use = def & ~BIT(BPF_REG_0);
+ if (bpf_get_call_summary(env, insn, &cs))
+ use = GENMASK(cs.num_params, 1);
+ break;
+ default:
+ def = 0;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ }
+
+ info->def = def;
+ info->use = use;
+}
+
+/* Compute may-live registers after each instruction in the program.
+ * The register is live after the instruction I if it is read by some
+ * instruction S following I during program execution and is not
+ * overwritten between I and S.
+ *
+ * Store result in env->insn_aux_data[i].live_regs.
+ */
+int bpf_compute_live_registers(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct insn_live_regs *state;
+ int insn_cnt = env->prog->len;
+ int err = 0, i, j;
+ bool changed;
+
+ /* Use the following algorithm:
+ * - define the following:
+ * - I.use : a set of all registers read by instruction I;
+ * - I.def : a set of all registers written by instruction I;
+ * - I.in : a set of all registers that may be alive before I execution;
+ * - I.out : a set of all registers that may be alive after I execution;
+ * - insn_successors(I): a set of instructions S that might immediately
+ * follow I for some program execution;
+ * - associate separate empty sets 'I.in' and 'I.out' with each instruction;
+ * - visit each instruction in a postorder and update
+ * state[i].in, state[i].out as follows:
+ *
+ * state[i].out = U [state[s].in for S in insn_successors(i)]
+ * state[i].in = (state[i].out / state[i].def) U state[i].use
+ *
+ * (where U stands for set union, / stands for set difference)
+ * - repeat the computation while {in,out} fields changes for
+ * any instruction.
+ */
+ state = kvzalloc_objs(*state, insn_cnt, GFP_KERNEL_ACCOUNT);
+ if (!state) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ compute_insn_live_regs(env, &insns[i], &state[i]);
+
+ /* Forward pass: resolve stack access through FP-derived pointers */
+ err = bpf_compute_subprog_arg_access(env);
+ if (err)
+ goto out;
+
+ changed = true;
+ while (changed) {
+ changed = false;
+ for (i = 0; i < env->cfg.cur_postorder; ++i) {
+ int insn_idx = env->cfg.insn_postorder[i];
+ struct insn_live_regs *live = &state[insn_idx];
+ struct bpf_iarray *succ;
+ u16 new_out = 0;
+ u16 new_in = 0;
+
+ succ = bpf_insn_successors(env, insn_idx);
+ for (int s = 0; s < succ->cnt; ++s)
+ new_out |= state[succ->items[s]].in;
+ new_in = (new_out & ~live->def) | live->use;
+ if (new_out != live->out || new_in != live->in) {
+ live->in = new_in;
+ live->out = new_out;
+ changed = true;
+ }
+ }
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ insn_aux[i].live_regs_before = state[i].in;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "Live regs before insn:\n");
+ for (i = 0; i < insn_cnt; ++i) {
+ if (env->insn_aux_data[i].scc)
+ verbose(env, "%3d ", env->insn_aux_data[i].scc);
+ else
+ verbose(env, " ");
+ verbose(env, "%3d: ", i);
+ for (j = BPF_REG_0; j < BPF_REG_10; ++j)
+ if (insn_aux[i].live_regs_before & BIT(j))
+ verbose(env, "%d", j);
+ else
+ verbose(env, ".");
+ verbose(env, " ");
+ bpf_verbose_insn(env, &insns[i]);
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+ }
+
+out:
+ kvfree(state);
+ return err;
+}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 8fca0c64f7b1..23267213a17f 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -270,7 +270,7 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key,
goto enoent;
storage = list_next_entry(storage, list_map);
- if (!storage)
+ if (list_entry_is_head(storage, &map->list, list_map))
goto enoent;
} else {
storage = list_first_entry(&map->list,
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index a0c3b35de2ce..011e4ec25acd 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -329,47 +329,6 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
}
EXPORT_SYMBOL_GPL(bpf_log);
-static const struct bpf_line_info *
-find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
-{
- const struct bpf_line_info *linfo;
- const struct bpf_prog *prog;
- u32 nr_linfo;
- int l, r, m;
-
- prog = env->prog;
- nr_linfo = prog->aux->nr_linfo;
-
- if (!nr_linfo || insn_off >= prog->len)
- return NULL;
-
- linfo = prog->aux->linfo;
- /* Loop invariant: linfo[l].insn_off <= insns_off.
- * linfo[0].insn_off == 0 which always satisfies above condition.
- * Binary search is searching for rightmost linfo entry that satisfies
- * the above invariant, giving us the desired record that covers given
- * instruction offset.
- */
- l = 0;
- r = nr_linfo - 1;
- while (l < r) {
- /* (r - l + 1) / 2 means we break a tie to the right, so if:
- * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off,
- * then m=2, we see that linfo[m].insn_off > insn_off, and so
- * r becomes 1 and we exit the loop with correct l==1.
- * If the tie was broken to the left, m=1 would end us up in
- * an endless loop where l and m stay at 1 and r stays at 2.
- */
- m = l + (r - l + 1) / 2;
- if (linfo[m].insn_off <= insn_off)
- l = m;
- else
- r = m - 1;
- }
-
- return &linfo[l];
-}
-
static const char *ltrim(const char *s)
{
while (isspace(*s))
@@ -390,7 +349,7 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
return;
prev_linfo = env->prev_linfo;
- linfo = find_linfo(env, insn_off);
+ linfo = bpf_find_linfo(env->prog, insn_off);
if (!linfo || linfo == prev_linfo)
return;
@@ -542,7 +501,8 @@ static char slot_type_char[] = {
[STACK_ZERO] = '0',
[STACK_DYNPTR] = 'd',
[STACK_ITER] = 'i',
- [STACK_IRQ_FLAG] = 'f'
+ [STACK_IRQ_FLAG] = 'f',
+ [STACK_POISON] = 'p',
};
#define UNUM_MAX_DECIMAL U16_MAX
@@ -581,6 +541,8 @@ int tnum_strn(char *str, size_t size, struct tnum a)
if (a.mask == 0) {
if (is_unum_decimal(a.value))
return snprintf(str, size, "%llu", a.value);
+ if (is_snum_decimal(a.value))
+ return snprintf(str, size, "%lld", a.value);
else
return snprintf(str, size, "%#llx", a.value);
}
@@ -692,7 +654,7 @@ static void print_reg_state(struct bpf_verifier_env *env,
if (state->frameno != reg->frameno)
verbose(env, "[%d]", reg->frameno);
if (tnum_is_const(reg->var_off)) {
- verbose_snum(env, reg->var_off.value + reg->off);
+ verbose_snum(env, reg->var_off.value + reg->delta);
return;
}
}
@@ -702,7 +664,7 @@ static void print_reg_state(struct bpf_verifier_env *env,
if (reg->id)
verbose_a("id=%d", reg->id & ~BPF_ADD_CONST);
if (reg->id & BPF_ADD_CONST)
- verbose(env, "%+d", reg->off);
+ verbose(env, "%+d", reg->delta);
if (reg->ref_obj_id)
verbose_a("ref_obj_id=%d", reg->ref_obj_id);
if (type_is_non_owning_ref(reg->type))
@@ -714,9 +676,9 @@ static void print_reg_state(struct bpf_verifier_env *env,
reg->map_ptr->key_size,
reg->map_ptr->value_size);
}
- if (t != SCALAR_VALUE && reg->off) {
+ if (t != SCALAR_VALUE && reg->delta) {
verbose_a("off=");
- verbose_snum(env, reg->off);
+ verbose_snum(env, reg->delta);
}
if (type_is_pkt_pointer(t)) {
verbose_a("r=");
@@ -777,7 +739,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
for (j = 0; j < BPF_REG_SIZE; j++) {
slot_type = state->stack[i].slot_type[j];
- if (slot_type != STACK_INVALID)
+ if (slot_type != STACK_INVALID && slot_type != STACK_POISON)
valid = true;
types_buf[j] = slot_type_char[slot_type];
}
@@ -845,7 +807,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
mark_verifier_state_clean(env);
}
-static inline u32 vlog_alignment(u32 pos)
+u32 bpf_vlog_alignment(u32 pos)
{
return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
BPF_LOG_MIN_ALIGNMENT) - pos - 1;
@@ -857,7 +819,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st
if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
/* remove new line character */
bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
- verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
+ verbose(env, "%*c;", bpf_vlog_alignment(env->prev_insn_print_pos), ' ');
} else {
verbose(env, "%d:", env->insn_idx);
}
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 682a9f34214b..e9662db7198f 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -284,17 +284,6 @@ static void __free_rcu(struct rcu_head *head)
atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}
-static void __free_rcu_tasks_trace(struct rcu_head *head)
-{
- /* If RCU Tasks Trace grace period implies RCU grace period,
- * there is no need to invoke call_rcu().
- */
- if (rcu_trace_implies_rcu_gp())
- __free_rcu(head);
- else
- call_rcu(head, __free_rcu);
-}
-
static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
struct llist_node *llnode = obj;
@@ -326,12 +315,12 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
return;
}
- /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
- * If RCU Tasks Trace grace period implies RCU grace period, free
- * these elements directly, else use call_rcu() to wait for normal
- * progs to finish and finally do free_one() on each element.
+ /*
+ * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * RCU Tasks Trace grace period implies RCU grace period, so pass
+ * __free_rcu directly as the callback.
*/
- call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
+ call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu);
}
static void free_bulk(struct bpf_mem_cache *c)
@@ -696,20 +685,18 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
- /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+ /*
+ * waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
* might still execute. Wait for them.
*
* rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
* but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
- * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
- * so if call_rcu(head, __free_rcu) is skipped due to
- * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
- * using rcu_trace_implies_rcu_gp() as well.
+ * to wait for the pending __free_by_rcu(), and __free_rcu(). RCU Tasks
+ * Trace grace period implies RCU grace period, so all __free_rcu don't
+ * need extra call_rcu() (and thus extra rcu_barrier() here).
*/
rcu_barrier(); /* wait for __free_by_rcu */
rcu_barrier_tasks_trace(); /* wait for __free_rcu */
- if (!rcu_trace_implies_rcu_gp())
- rcu_barrier();
free_mem_alloc_no_barrier(ma);
}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 0ad97d643bf4..0d6f5569588c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -435,9 +435,8 @@ static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data)
if (aux->offload) {
args->info->ifindex = aux->offload->netdev->ifindex;
- net = dev_net(aux->offload->netdev);
- get_net(net);
- ns = &net->ns;
+ net = maybe_get_net(dev_net(aux->offload->netdev));
+ ns = net ? &net->ns : NULL;
} else {
args->info->ifindex = 0;
ns = NULL;
@@ -647,9 +646,8 @@ static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data)
if (args->offmap->netdev) {
args->info->ifindex = args->offmap->netdev->ifindex;
- net = dev_net(args->offmap->netdev);
- get_net(net);
- ns = &net->ns;
+ net = maybe_get_net(dev_net(args->offmap->netdev));
+ ns = net ? &net->ns : NULL;
} else {
args->info->ifindex = 0;
ns = NULL;
diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c
new file mode 100644
index 000000000000..8478d2c6ed5b
--- /dev/null
+++ b/kernel/bpf/states.c
@@ -0,0 +1,1563 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/filter.h>
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+#define BPF_COMPLEXITY_LIMIT_STATES 64
+
+static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
+{
+ return bpf_is_may_goto_insn(&env->prog->insnsi[insn_idx]);
+}
+
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+static void update_peak_states(struct bpf_verifier_env *env)
+{
+ u32 cur_states;
+
+ cur_states = env->explored_states_size + env->free_list_size + env->num_backedges;
+ env->peak_states = max(env->peak_states, cur_states);
+}
+
+/* struct bpf_verifier_state->parent refers to states
+ * that are in either of env->{expored_states,free_list}.
+ * In both cases the state is contained in struct bpf_verifier_state_list.
+ */
+static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st)
+{
+ if (st->parent)
+ return container_of(st->parent, struct bpf_verifier_state_list, state);
+ return NULL;
+}
+
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st);
+
+/* A state can be freed if it is no longer referenced:
+ * - is in the env->free_list;
+ * - has no children states;
+ */
+static void maybe_free_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state_list *sl)
+{
+ if (!sl->in_free_list
+ || sl->state.branches != 0
+ || incomplete_read_marks(env, &sl->state))
+ return;
+ list_del(&sl->node);
+ bpf_free_verifier_state(&sl->state, false);
+ kfree(sl);
+ env->free_list_size--;
+}
+
+/* For state @st look for a topmost frame with frame_insn_idx() in some SCC,
+ * if such frame exists form a corresponding @callchain as an array of
+ * call sites leading to this frame and SCC id.
+ * E.g.:
+ *
+ * void foo() { A: loop {... SCC#1 ...}; }
+ * void bar() { B: loop { C: foo(); ... SCC#2 ... }
+ * D: loop { E: foo(); ... SCC#3 ... } }
+ * void main() { F: bar(); }
+ *
+ * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending
+ * on @st frame call sites being (F,C,A) or (F,E,A).
+ */
+static bool compute_scc_callchain(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ struct bpf_scc_callchain *callchain)
+{
+ u32 i, scc, insn_idx;
+
+ memset(callchain, 0, sizeof(*callchain));
+ for (i = 0; i <= st->curframe; i++) {
+ insn_idx = bpf_frame_insn_idx(st, i);
+ scc = env->insn_aux_data[insn_idx].scc;
+ if (scc) {
+ callchain->scc = scc;
+ break;
+ } else if (i < st->curframe) {
+ callchain->callsites[i] = insn_idx;
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
+
+/* Check if bpf_scc_visit instance for @callchain exists. */
+static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env,
+ struct bpf_scc_callchain *callchain)
+{
+ struct bpf_scc_info *info = env->scc_info[callchain->scc];
+ struct bpf_scc_visit *visits = info->visits;
+ u32 i;
+
+ if (!info)
+ return NULL;
+ for (i = 0; i < info->num_visits; i++)
+ if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0)
+ return &visits[i];
+ return NULL;
+}
+
+/* Allocate a new bpf_scc_visit instance corresponding to @callchain.
+ * Allocated instances are alive for a duration of the do_check_common()
+ * call and are freed by free_states().
+ */
+static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env,
+ struct bpf_scc_callchain *callchain)
+{
+ struct bpf_scc_visit *visit;
+ struct bpf_scc_info *info;
+ u32 scc, num_visits;
+ u64 new_sz;
+
+ scc = callchain->scc;
+ info = env->scc_info[scc];
+ num_visits = info ? info->num_visits : 0;
+ new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1);
+ info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT);
+ if (!info)
+ return NULL;
+ env->scc_info[scc] = info;
+ info->num_visits = num_visits + 1;
+ visit = &info->visits[num_visits];
+ memset(visit, 0, sizeof(*visit));
+ memcpy(&visit->callchain, callchain, sizeof(*callchain));
+ return visit;
+}
+
+/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */
+static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain)
+{
+ char *buf = env->tmp_str_buf;
+ int i, delta = 0;
+
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "(");
+ for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) {
+ if (!callchain->callsites[i])
+ break;
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,",
+ callchain->callsites[i]);
+ }
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc);
+ return env->tmp_str_buf;
+}
+
+/* If callchain for @st exists (@st is in some SCC), ensure that
+ * bpf_scc_visit instance for this callchain exists.
+ * If instance does not exist or is empty, assign visit->entry_state to @st.
+ */
+static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return 0;
+ visit = scc_visit_lookup(env, callchain);
+ visit = visit ?: scc_visit_alloc(env, callchain);
+ if (!visit)
+ return -ENOMEM;
+ if (!visit->entry_state) {
+ visit->entry_state = st;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC enter %s\n", format_callchain(env, callchain));
+ }
+ return 0;
+}
+
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit);
+
+/* If callchain for @st exists (@st is in some SCC), make it empty:
+ * - set visit->entry_state to NULL;
+ * - flush accumulated backedges.
+ */
+static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return 0;
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit) {
+ /*
+ * If path traversal stops inside an SCC, corresponding bpf_scc_visit
+ * must exist for non-speculative paths. For non-speculative paths
+ * traversal stops when:
+ * a. Verification error is found, maybe_exit_scc() is not called.
+ * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member
+ * of any SCC.
+ * c. A checkpoint is reached and matched. Checkpoints are created by
+ * is_state_visited(), which calls maybe_enter_scc(), which allocates
+ * bpf_scc_visit instances for checkpoints within SCCs.
+ * (c) is the only case that can reach this point.
+ */
+ if (!st->speculative) {
+ verifier_bug(env, "scc exit: no visit info for call chain %s",
+ format_callchain(env, callchain));
+ return -EFAULT;
+ }
+ return 0;
+ }
+ if (visit->entry_state != st)
+ return 0;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC exit %s\n", format_callchain(env, callchain));
+ visit->entry_state = NULL;
+ env->num_backedges -= visit->num_backedges;
+ visit->num_backedges = 0;
+ update_peak_states(env);
+ return propagate_backedges(env, visit);
+}
+
+/* Lookup an bpf_scc_visit instance corresponding to @st callchain
+ * and add @backedge to visit->backedges. @st callchain must exist.
+ */
+static int add_scc_backedge(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ struct bpf_scc_backedge *backedge)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain)) {
+ verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d",
+ st->insn_idx);
+ return -EFAULT;
+ }
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit) {
+ verifier_bug(env, "add backedge: no visit info for call chain %s",
+ format_callchain(env, callchain));
+ return -EFAULT;
+ }
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC backedge %s\n", format_callchain(env, callchain));
+ backedge->next = visit->backedges;
+ visit->backedges = backedge;
+ visit->num_backedges++;
+ env->num_backedges++;
+ update_peak_states(env);
+ return 0;
+}
+
+/* bpf_reg_state->live marks for registers in a state @st are incomplete,
+ * if state @st is in some SCC and not all execution paths starting at this
+ * SCC are fully explored.
+ */
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return false;
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit)
+ return false;
+ return !!visit->backedges;
+}
+
+int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_verifier_state_list *sl = NULL, *parent_sl;
+ struct bpf_verifier_state *parent;
+ int err;
+
+ while (st) {
+ u32 br = --st->branches;
+
+ /* verifier_bug_if(br > 1, ...) technically makes sense here,
+ * but see comment in push_stack(), hence:
+ */
+ verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br);
+ if (br)
+ break;
+ err = maybe_exit_scc(env, st);
+ if (err)
+ return err;
+ parent = st->parent;
+ parent_sl = state_parent_as_list(st);
+ if (sl)
+ maybe_free_verifier_state(env, sl);
+ st = parent;
+ sl = parent_sl;
+ }
+ return 0;
+}
+
+/* check %cur's range satisfies %old's */
+static bool range_within(const struct bpf_reg_state *old,
+ const struct bpf_reg_state *cur)
+{
+ return old->umin_value <= cur->umin_value &&
+ old->umax_value >= cur->umax_value &&
+ old->smin_value <= cur->smin_value &&
+ old->smax_value >= cur->smax_value &&
+ old->u32_min_value <= cur->u32_min_value &&
+ old->u32_max_value >= cur->u32_max_value &&
+ old->s32_min_value <= cur->s32_min_value &&
+ old->s32_max_value >= cur->s32_max_value;
+}
+
+/* If in the old state two registers had the same id, then they need to have
+ * the same id in the new state as well. But that id could be different from
+ * the old state, so we need to track the mapping from old to new ids.
+ * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
+ * regs with old id 5 must also have new id 9 for the new state to be safe. But
+ * regs with a different old id could still have new id 9, we don't care about
+ * that.
+ * So we look through our idmap to see if this old id has been seen before. If
+ * so, we require the new id to match; otherwise, we add the id pair to the map.
+ */
+static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
+{
+ struct bpf_id_pair *map = idmap->map;
+ unsigned int i;
+
+ /* either both IDs should be set or both should be zero */
+ if (!!old_id != !!cur_id)
+ return false;
+
+ if (old_id == 0) /* cur_id == 0 as well */
+ return true;
+
+ for (i = 0; i < idmap->cnt; i++) {
+ if (map[i].old == old_id)
+ return map[i].cur == cur_id;
+ if (map[i].cur == cur_id)
+ return false;
+ }
+
+ /* Reached the end of known mappings; haven't seen this id before */
+ if (idmap->cnt < BPF_ID_MAP_SIZE) {
+ map[idmap->cnt].old = old_id;
+ map[idmap->cnt].cur = cur_id;
+ idmap->cnt++;
+ return true;
+ }
+
+ /* We ran out of idmap slots, which should be impossible */
+ WARN_ON_ONCE(1);
+ return false;
+}
+
+/*
+ * Compare scalar register IDs for state equivalence.
+ *
+ * When old_id == 0, the old register is independent - not linked to any
+ * other register. Any linking in the current state only adds constraints,
+ * making it more restrictive. Since the old state didn't rely on any ID
+ * relationships for this register, it's always safe to accept cur regardless
+ * of its ID. Hence, return true immediately.
+ *
+ * When old_id != 0 but cur_id == 0, we need to ensure that different
+ * independent registers in cur don't incorrectly satisfy the ID matching
+ * requirements of linked registers in old.
+ *
+ * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0
+ * and r7.id=0 (both independent), without temp IDs both would map old_id=X
+ * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map
+ * X->temp2, but X is already mapped to temp1, so the check fails correctly.
+ *
+ * When old_id has BPF_ADD_CONST set, the compound id (base | flag) and the
+ * base id (flag stripped) must both map consistently. Example: old has
+ * r2.id=A, r3.id=A|flag (r3 = r2 + delta), cur has r2.id=B, r3.id=C|flag
+ * (r3 derived from unrelated r4). Without the base check, idmap gets two
+ * independent entries A->B and A|flag->C|flag, missing that A->C conflicts
+ * with A->B. The base ID cross-check catches this.
+ */
+static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
+{
+ if (!old_id)
+ return true;
+
+ cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
+
+ if (!check_ids(old_id, cur_id, idmap))
+ return false;
+ if (old_id & BPF_ADD_CONST) {
+ old_id &= ~BPF_ADD_CONST;
+ cur_id &= ~BPF_ADD_CONST;
+ if (!check_ids(old_id, cur_id, idmap))
+ return false;
+ }
+ return true;
+}
+
+static void __clean_func_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *st,
+ u16 live_regs, int frame)
+{
+ int i, j;
+
+ for (i = 0; i < BPF_REG_FP; i++) {
+ /* liveness must not touch this register anymore */
+ if (!(live_regs & BIT(i)))
+ /* since the register is unused, clear its state
+ * to make further comparison simpler
+ */
+ bpf_mark_reg_not_init(env, &st->regs[i]);
+ }
+
+ /*
+ * Clean dead 4-byte halves within each SPI independently.
+ * half_spi 2*i → lower half: slot_type[0..3] (closer to FP)
+ * half_spi 2*i+1 → upper half: slot_type[4..7] (farther from FP)
+ */
+ for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
+ bool lo_live = bpf_stack_slot_alive(env, frame, i * 2);
+ bool hi_live = bpf_stack_slot_alive(env, frame, i * 2 + 1);
+
+ if (!hi_live || !lo_live) {
+ int start = !lo_live ? 0 : BPF_REG_SIZE / 2;
+ int end = !hi_live ? BPF_REG_SIZE : BPF_REG_SIZE / 2;
+ u8 stype = st->stack[i].slot_type[7];
+
+ /*
+ * Don't clear special slots.
+ * destroy_if_dynptr_stack_slot() needs STACK_DYNPTR to
+ * detect overwrites and invalidate associated data slices.
+ * is_iter_reg_valid_uninit() and is_irq_flag_reg_valid_uninit()
+ * check for their respective slot types to detect double-create.
+ */
+ if (stype == STACK_DYNPTR || stype == STACK_ITER ||
+ stype == STACK_IRQ_FLAG)
+ continue;
+
+ /*
+ * Only destroy spilled_ptr when hi half is dead.
+ * If hi half is still live with STACK_SPILL, the
+ * spilled_ptr metadata is needed for correct state
+ * comparison in stacksafe().
+ * is_spilled_reg() is using slot_type[7], but
+ * is_spilled_scalar_after() check either slot_type[0] or [4]
+ */
+ if (!hi_live) {
+ struct bpf_reg_state *spill = &st->stack[i].spilled_ptr;
+
+ if (lo_live && stype == STACK_SPILL) {
+ u8 val = STACK_MISC;
+
+ /*
+ * 8 byte spill of scalar 0 where half slot is dead
+ * should become STACK_ZERO in lo 4 bytes.
+ */
+ if (bpf_register_is_null(spill))
+ val = STACK_ZERO;
+ for (j = 0; j < 4; j++) {
+ u8 *t = &st->stack[i].slot_type[j];
+
+ if (*t == STACK_SPILL)
+ *t = val;
+ }
+ }
+ bpf_mark_reg_not_init(env, spill);
+ }
+ for (j = start; j < end; j++)
+ st->stack[i].slot_type[j] = STACK_POISON;
+ }
+ }
+}
+
+static int clean_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ int i, err;
+
+ err = bpf_live_stack_query_init(env, st);
+ if (err)
+ return err;
+ for (i = 0; i <= st->curframe; i++) {
+ u32 ip = bpf_frame_insn_idx(st, i);
+ u16 live_regs = env->insn_aux_data[ip].live_regs_before;
+
+ __clean_func_state(env, st->frame[i], live_regs, i);
+ }
+ return 0;
+}
+
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap)
+{
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+}
+
+enum exact_level {
+ NOT_EXACT,
+ EXACT,
+ RANGE_WITHIN
+};
+
+/* Returns true if (rold safe implies rcur safe) */
+static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+ struct bpf_reg_state *rcur, struct bpf_idmap *idmap,
+ enum exact_level exact)
+{
+ if (exact == EXACT)
+ return regs_exact(rold, rcur, idmap);
+
+ if (rold->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
+
+ /* Enforce that register types have to match exactly, including their
+ * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
+ * rule.
+ *
+ * One can make a point that using a pointer register as unbounded
+ * SCALAR would be technically acceptable, but this could lead to
+ * pointer leaks because scalars are allowed to leak while pointers
+ * are not. We could make this safe in special cases if root is
+ * calling us, but it's probably not worth the hassle.
+ *
+ * Also, register types that are *not* MAYBE_NULL could technically be
+ * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
+ * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
+ * to the same map).
+ * However, if the old MAYBE_NULL register then got NULL checked,
+ * doing so could have affected others with the same id, and we can't
+ * check for that because we lost the id when we converted to
+ * a non-MAYBE_NULL variant.
+ * So, as a general rule we don't allow mixing MAYBE_NULL and
+ * non-MAYBE_NULL registers as well.
+ */
+ if (rold->type != rcur->type)
+ return false;
+
+ switch (base_type(rold->type)) {
+ case SCALAR_VALUE:
+ if (env->explore_alu_limits) {
+ /* explore_alu_limits disables tnum_in() and range_within()
+ * logic and requires everything to be strict
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
+ }
+ if (!rold->precise && exact == NOT_EXACT)
+ return true;
+ /*
+ * Linked register tracking uses rold->id to detect relationships.
+ * When rold->id == 0, the register is independent and any linking
+ * in rcur only adds constraints. When rold->id != 0, we must verify
+ * id mapping and (for BPF_ADD_CONST) offset consistency.
+ *
+ * +------------------+-----------+------------------+---------------+
+ * | | rold->id | rold + ADD_CONST | rold->id == 0 |
+ * |------------------+-----------+------------------+---------------|
+ * | rcur->id | range,ids | false | range |
+ * | rcur + ADD_CONST | false | range,ids,off | range |
+ * | rcur->id == 0 | range,ids | false | range |
+ * +------------------+-----------+------------------+---------------+
+ *
+ * Why check_ids() for scalar registers?
+ *
+ * Consider the following BPF code:
+ * 1: r6 = ... unbound scalar, ID=a ...
+ * 2: r7 = ... unbound scalar, ID=b ...
+ * 3: if (r6 > r7) goto +1
+ * 4: r6 = r7
+ * 5: if (r6 > X) goto ...
+ * 6: ... memory operation using r7 ...
+ *
+ * First verification path is [1-6]:
+ * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
+ * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark
+ * r7 <= X, because r6 and r7 share same id.
+ * Next verification path is [1-4, 6].
+ *
+ * Instruction (6) would be reached in two states:
+ * I. r6{.id=b}, r7{.id=b} via path 1-6;
+ * II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
+ *
+ * Use check_ids() to distinguish these states.
+ * ---
+ * Also verify that new value satisfies old value range knowledge.
+ */
+
+ /*
+ * ADD_CONST flags must match exactly: BPF_ADD_CONST32 and
+ * BPF_ADD_CONST64 have different linking semantics in
+ * sync_linked_regs() (alu32 zero-extends, alu64 does not),
+ * so pruning across different flag types is unsafe.
+ */
+ if (rold->id &&
+ (rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
+ return false;
+
+ /* Both have offset linkage: offsets must match */
+ if ((rold->id & BPF_ADD_CONST) && rold->delta != rcur->delta)
+ return false;
+
+ if (!check_scalar_ids(rold->id, rcur->id, idmap))
+ return false;
+
+ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_TP_BUFFER:
+ /* If the new min/max/var_off satisfy the old ones and
+ * everything else matches, we are OK.
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
+ range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET:
+ /* We must have at least as much range as the old ptr
+ * did, so that any accesses which were safe before are
+ * still safe. This is true even if old range < old off,
+ * since someone could have accessed through (ptr - k), or
+ * even done ptr -= k in a register, to get a safe access.
+ */
+ if (rold->range < 0 || rcur->range < 0) {
+ /* special case for [BEYOND|AT]_PKT_END */
+ if (rold->range != rcur->range)
+ return false;
+ } else if (rold->range > rcur->range) {
+ return false;
+ }
+ /* id relations must be preserved */
+ if (!check_ids(rold->id, rcur->id, idmap))
+ return false;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_STACK:
+ /* two stack pointers are equal only if they're pointing to
+ * the same stack frame, since fp-8 in foo != fp-8 in bar
+ */
+ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
+ case PTR_TO_ARENA:
+ return true;
+ case PTR_TO_INSN:
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
+ range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off);
+ default:
+ return regs_exact(rold, rcur, idmap);
+ }
+}
+
+static struct bpf_reg_state unbound_reg;
+
+static __init int unbound_reg_init(void)
+{
+ bpf_mark_reg_unknown_imprecise(&unbound_reg);
+ return 0;
+}
+late_initcall(unbound_reg_init);
+
+static bool is_spilled_scalar_after(const struct bpf_stack_state *stack, int im)
+{
+ return stack->slot_type[im] == STACK_SPILL &&
+ stack->spilled_ptr.type == SCALAR_VALUE;
+}
+
+static bool is_stack_misc_after(struct bpf_verifier_env *env,
+ struct bpf_stack_state *stack, int im)
+{
+ u32 i;
+
+ for (i = im; i < ARRAY_SIZE(stack->slot_type); ++i) {
+ if ((stack->slot_type[i] == STACK_MISC) ||
+ ((stack->slot_type[i] == STACK_INVALID || stack->slot_type[i] == STACK_POISON) &&
+ env->allow_uninit_stack))
+ continue;
+ return false;
+ }
+
+ return true;
+}
+
+static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env,
+ struct bpf_stack_state *stack, int im)
+{
+ if (is_spilled_scalar_after(stack, im))
+ return &stack->spilled_ptr;
+
+ if (is_stack_misc_after(env, stack, im))
+ return &unbound_reg;
+
+ return NULL;
+}
+
+static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, struct bpf_idmap *idmap,
+ enum exact_level exact)
+{
+ int i, spi;
+
+ /* walk slots of the explored stack and ignore any additional
+ * slots in the current stack, since explored(safe) state
+ * didn't use them
+ */
+ for (i = 0; i < old->allocated_stack; i++) {
+ struct bpf_reg_state *old_reg, *cur_reg;
+ int im = i % BPF_REG_SIZE;
+
+ spi = i / BPF_REG_SIZE;
+
+ if (exact == EXACT) {
+ u8 old_type = old->stack[spi].slot_type[i % BPF_REG_SIZE];
+ u8 cur_type = i < cur->allocated_stack ?
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE] : STACK_INVALID;
+
+ /* STACK_INVALID and STACK_POISON are equivalent for pruning */
+ if (old_type == STACK_POISON)
+ old_type = STACK_INVALID;
+ if (cur_type == STACK_POISON)
+ cur_type = STACK_INVALID;
+ if (i >= cur->allocated_stack || old_type != cur_type)
+ return false;
+ }
+
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID ||
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_POISON)
+ continue;
+
+ if (env->allow_uninit_stack &&
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
+ continue;
+
+ /* explored stack has more populated slots than current stack
+ * and these slots were used
+ */
+ if (i >= cur->allocated_stack)
+ return false;
+
+ /*
+ * 64 and 32-bit scalar spills vs MISC/INVALID slots and vice versa.
+ * Load from MISC/INVALID slots produces unbound scalar.
+ * Construct a fake register for such stack and call
+ * regsafe() to ensure scalar ids are compared.
+ */
+ if (im == 0 || im == 4) {
+ old_reg = scalar_reg_for_stack(env, &old->stack[spi], im);
+ cur_reg = scalar_reg_for_stack(env, &cur->stack[spi], im);
+ if (old_reg && cur_reg) {
+ if (!regsafe(env, old_reg, cur_reg, idmap, exact))
+ return false;
+ i += (im == 0 ? BPF_REG_SIZE - 1 : 3);
+ continue;
+ }
+ }
+
+ /* if old state was safe with misc data in the stack
+ * it will be safe with zero-initialized stack.
+ * The opposite is not true
+ */
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
+ continue;
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has STACK_MISC ->
+ * this verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
+ return false;
+ if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
+ continue;
+ /* Both old and cur are having same slot_type */
+ switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
+ /* when explored and current stack slot are both storing
+ * spilled registers, check that stored pointers types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
+ * but current path has stored:
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap, exact))
+ return false;
+ break;
+ case STACK_DYNPTR:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (old_reg->dynptr.type != cur_reg->dynptr.type ||
+ old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ case STACK_ITER:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ /* iter.depth is not compared between states as it
+ * doesn't matter for correctness and would otherwise
+ * prevent convergence; we maintain it only to prevent
+ * infinite loop check triggering, see
+ * iter_active_depths_differ()
+ */
+ if (old_reg->iter.btf != cur_reg->iter.btf ||
+ old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+ old_reg->iter.state != cur_reg->iter.state ||
+ /* ignore {old_reg,cur_reg}->iter.depth, see above */
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ case STACK_IRQ_FLAG:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) ||
+ old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class)
+ return false;
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ case STACK_INVALID:
+ case STACK_POISON:
+ continue;
+ /* Ensure that new unhandled slot types return false by default */
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
+ struct bpf_idmap *idmap)
+{
+ int i;
+
+ if (old->acquired_refs != cur->acquired_refs)
+ return false;
+
+ if (old->active_locks != cur->active_locks)
+ return false;
+
+ if (old->active_preempt_locks != cur->active_preempt_locks)
+ return false;
+
+ if (old->active_rcu_locks != cur->active_rcu_locks)
+ return false;
+
+ if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
+ return false;
+
+ if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) ||
+ old->active_lock_ptr != cur->active_lock_ptr)
+ return false;
+
+ for (i = 0; i < old->acquired_refs; i++) {
+ if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
+ old->refs[i].type != cur->refs[i].type)
+ return false;
+ switch (old->refs[i].type) {
+ case REF_TYPE_PTR:
+ case REF_TYPE_IRQ:
+ break;
+ case REF_TYPE_LOCK:
+ case REF_TYPE_RES_LOCK:
+ case REF_TYPE_RES_LOCK_IRQ:
+ if (old->refs[i].ptr != cur->refs[i].ptr)
+ return false;
+ break;
+ default:
+ WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* compare two verifier states
+ *
+ * all states stored in state_list are known to be valid, since
+ * verifier reached 'bpf_exit' instruction through them
+ *
+ * this function is called when verifier exploring different branches of
+ * execution popped from the state stack. If it sees an old state that has
+ * more strict register state and more strict stack state then this execution
+ * branch doesn't need to be explored further, since verifier already
+ * concluded that more strict state leads to valid finish.
+ *
+ * Therefore two states are equivalent if register state is more conservative
+ * and explored stack state is more conservative than the current one.
+ * Example:
+ * explored current
+ * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
+ * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
+ *
+ * In other words if current stack state (one being explored) has more
+ * valid slots than old one that already passed validation, it means
+ * the verifier can stop exploring and conclude that current state is valid too
+ *
+ * Similarly with registers. If explored state has register type as invalid
+ * whereas register type in current state is meaningful, it means that
+ * the current state will reach 'bpf_exit' instruction safely
+ */
+static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact)
+{
+ u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before;
+ u16 i;
+
+ if (old->callback_depth > cur->callback_depth)
+ return false;
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (((1 << i) & live_regs) &&
+ !regsafe(env, &old->regs[i], &cur->regs[i],
+ &env->idmap_scratch, exact))
+ return false;
+
+ if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
+ return false;
+
+ return true;
+}
+
+static void reset_idmap_scratch(struct bpf_verifier_env *env)
+{
+ struct bpf_idmap *idmap = &env->idmap_scratch;
+
+ idmap->tmp_id_gen = env->id_gen;
+ idmap->cnt = 0;
+}
+
+static bool states_equal(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur,
+ enum exact_level exact)
+{
+ u32 insn_idx;
+ int i;
+
+ if (old->curframe != cur->curframe)
+ return false;
+
+ reset_idmap_scratch(env);
+
+ /* Verification state from speculative execution simulation
+ * must never prune a non-speculative execution one.
+ */
+ if (old->speculative && !cur->speculative)
+ return false;
+
+ if (old->in_sleepable != cur->in_sleepable)
+ return false;
+
+ if (!refsafe(old, cur, &env->idmap_scratch))
+ return false;
+
+ /* for states to be equal callsites have to be the same
+ * and all frame states need to be equivalent
+ */
+ for (i = 0; i <= old->curframe; i++) {
+ insn_idx = bpf_frame_insn_idx(old, i);
+ if (old->frame[i]->callsite != cur->frame[i]->callsite)
+ return false;
+ if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact))
+ return false;
+ }
+ return true;
+}
+
+/* find precise scalars in the previous equivalent state and
+ * propagate them into the current state
+ */
+static int propagate_precision(struct bpf_verifier_env *env,
+ const struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur,
+ bool *changed)
+{
+ struct bpf_reg_state *state_reg;
+ struct bpf_func_state *state;
+ int i, err = 0, fr;
+ bool first;
+
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ state_reg = state->regs;
+ first = true;
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating r%d", fr, i);
+ else
+ verbose(env, ",r%d", i);
+ }
+ bpf_bt_set_frame_reg(&env->bt, fr, i);
+ first = false;
+ }
+
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!bpf_is_spilled_reg(&state->stack[i]))
+ continue;
+ state_reg = &state->stack[i].spilled_ptr;
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating fp%d",
+ fr, (-i - 1) * BPF_REG_SIZE);
+ else
+ verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
+ }
+ bpf_bt_set_frame_slot(&env->bt, fr, i);
+ first = false;
+ }
+ if (!first && (env->log.level & BPF_LOG_LEVEL2))
+ verbose(env, "\n");
+ }
+
+ err = bpf_mark_chain_precision(env, cur, -1, changed);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+#define MAX_BACKEDGE_ITERS 64
+
+/* Propagate read and precision marks from visit->backedges[*].state->equal_state
+ * to corresponding parent states of visit->backedges[*].state until fixed point is reached,
+ * then free visit->backedges.
+ * After execution of this function incomplete_read_marks() will return false
+ * for all states corresponding to @visit->callchain.
+ */
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit)
+{
+ struct bpf_scc_backedge *backedge;
+ struct bpf_verifier_state *st;
+ bool changed;
+ int i, err;
+
+ i = 0;
+ do {
+ if (i++ > MAX_BACKEDGE_ITERS) {
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "%s: too many iterations\n", __func__);
+ for (backedge = visit->backedges; backedge; backedge = backedge->next)
+ bpf_mark_all_scalars_precise(env, &backedge->state);
+ break;
+ }
+ changed = false;
+ for (backedge = visit->backedges; backedge; backedge = backedge->next) {
+ st = &backedge->state;
+ err = propagate_precision(env, st->equal_state, st, &changed);
+ if (err)
+ return err;
+ }
+ } while (changed);
+
+ bpf_free_backedges(visit);
+ return 0;
+}
+
+static bool states_maybe_looping(struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_func_state *fold, *fcur;
+ int i, fr = cur->curframe;
+
+ if (old->curframe != fr)
+ return false;
+
+ fold = old->frame[fr];
+ fcur = cur->frame[fr];
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (memcmp(&fold->regs[i], &fcur->regs[i],
+ offsetof(struct bpf_reg_state, frameno)))
+ return false;
+ return true;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "infinite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ * 0: again: ; set up iter_next() call args
+ * 1: r1 = &it ; <CHECKPOINT HERE>
+ * 2: call bpf_iter_num_next ; this is iter_next() call
+ * 3: if r0 == 0 goto done
+ * 4: ... something useful here ...
+ * 5: goto again ; another iteration
+ * 6: done:
+ * 7: r1 = &it
+ * 8: call bpf_iter_num_destroy ; clean up iter state
+ * 9: exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * infinitely looping:
+ *
+ * struct bpf_iter_num it;
+ * int *p, x;
+ *
+ * bpf_iter_num_new(&it, 0, 10);
+ * while ((p = bpf_iter_num_next(&t))) {
+ * x = p;
+ * while (x--) {} // <<-- infinite loop here
+ * }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+ struct bpf_reg_state *slot, *cur_slot;
+ struct bpf_func_state *state;
+ int i, fr;
+
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_ITER)
+ continue;
+
+ slot = &state->stack[i].spilled_ptr;
+ if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+ continue;
+
+ cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+ if (cur_slot->iter.depth != slot->iter.depth)
+ return true;
+ }
+ }
+ return false;
+}
+
+static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!bpf_is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ }
+}
+
+int bpf_is_state_visited(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_verifier_state_list *new_sl;
+ struct bpf_verifier_state_list *sl;
+ struct bpf_verifier_state *cur = env->cur_state, *new;
+ bool force_new_state, add_new_state, loop;
+ int n, err, states_cnt = 0;
+ struct list_head *pos, *tmp, *head;
+
+ force_new_state = env->test_state_freq || bpf_is_force_checkpoint(env, insn_idx) ||
+ /* Avoid accumulating infinitely long jmp history */
+ cur->jmp_history_cnt > 40;
+
+ /* bpf progs typically have pruning point every 4 instructions
+ * http://vger.kernel.org/bpfconf2019.html#session-1
+ * Do not add new state for future pruning if the verifier hasn't seen
+ * at least 2 jumps and at least 8 instructions.
+ * This heuristics helps decrease 'total_states' and 'peak_states' metric.
+ * In tests that amounts to up to 50% reduction into total verifier
+ * memory consumption and 20% verifier time speedup.
+ */
+ add_new_state = force_new_state;
+ if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
+ env->insn_processed - env->prev_insn_processed >= 8)
+ add_new_state = true;
+
+ /* keep cleaning the current state as registers/stack become dead */
+ err = clean_verifier_state(env, cur);
+ if (err)
+ return err;
+
+ loop = false;
+ head = bpf_explored_state(env, insn_idx);
+ list_for_each_safe(pos, tmp, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
+ states_cnt++;
+ if (sl->state.insn_idx != insn_idx)
+ continue;
+
+ if (sl->state.branches) {
+ struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
+
+ if (frame->in_async_callback_fn &&
+ frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
+ /* Different async_entry_cnt means that the verifier is
+ * processing another entry into async callback.
+ * Seeing the same state is not an indication of infinite
+ * loop or infinite recursion.
+ * But finding the same state doesn't mean that it's safe
+ * to stop processing the current state. The previous state
+ * hasn't yet reached bpf_exit, since state.branches > 0.
+ * Checking in_async_callback_fn alone is not enough either.
+ * Since the verifier still needs to catch infinite loops
+ * inside async callbacks.
+ */
+ goto skip_inf_loop_check;
+ }
+ /* BPF open-coded iterators loop detection is special.
+ * states_maybe_looping() logic is too simplistic in detecting
+ * states that *might* be equivalent, because it doesn't know
+ * about ID remapping, so don't even perform it.
+ * See process_iter_next_call() and iter_active_depths_differ()
+ * for overview of the logic. When current and one of parent
+ * states are detected as equivalent, it's a good thing: we prove
+ * convergence and can stop simulating further iterations.
+ * It's safe to assume that iterator loop will finish, taking into
+ * account iter_next() contract of eventually returning
+ * sticky NULL result.
+ *
+ * Note, that states have to be compared exactly in this case because
+ * read and precision marks might not be finalized inside the loop.
+ * E.g. as in the program below:
+ *
+ * 1. r7 = -16
+ * 2. r6 = bpf_get_prandom_u32()
+ * 3. while (bpf_iter_num_next(&fp[-8])) {
+ * 4. if (r6 != 42) {
+ * 5. r7 = -32
+ * 6. r6 = bpf_get_prandom_u32()
+ * 7. continue
+ * 8. }
+ * 9. r0 = r10
+ * 10. r0 += r7
+ * 11. r8 = *(u64 *)(r0 + 0)
+ * 12. r6 = bpf_get_prandom_u32()
+ * 13. }
+ *
+ * Here verifier would first visit path 1-3, create a checkpoint at 3
+ * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
+ * not have read or precision mark for r7 yet, thus inexact states
+ * comparison would discard current state with r7=-32
+ * => unsafe memory access at 11 would not be caught.
+ */
+ if (is_iter_next_insn(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ struct bpf_func_state *cur_frame;
+ struct bpf_reg_state *iter_state, *iter_reg;
+ int spi;
+
+ cur_frame = cur->frame[cur->curframe];
+ /* btf_check_iter_kfuncs() enforces that
+ * iter state pointer is always the first arg
+ */
+ iter_reg = &cur_frame->regs[BPF_REG_1];
+ /* current state is valid due to states_equal(),
+ * so we can assume valid iter and reg state,
+ * no need for extra (re-)validations
+ */
+ spi = bpf_get_spi(iter_reg->var_off.value);
+ iter_state = &bpf_func(env, iter_reg)->stack[spi].spilled_ptr;
+ if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
+ loop = true;
+ goto hit;
+ }
+ }
+ goto skip_inf_loop_check;
+ }
+ if (is_may_goto_insn_at(env, insn_idx)) {
+ if (sl->state.may_goto_depth != cur->may_goto_depth &&
+ states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ loop = true;
+ goto hit;
+ }
+ }
+ if (bpf_calls_callback(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ loop = true;
+ goto hit;
+ }
+ goto skip_inf_loop_check;
+ }
+ /* attempt to detect infinite loop to avoid unnecessary doomed work */
+ if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur, EXACT) &&
+ !iter_active_depths_differ(&sl->state, cur) &&
+ sl->state.may_goto_depth == cur->may_goto_depth &&
+ sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
+ verbose_linfo(env, insn_idx, "; ");
+ verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+ verbose(env, "cur state:");
+ print_verifier_state(env, cur, cur->curframe, true);
+ verbose(env, "old state:");
+ print_verifier_state(env, &sl->state, cur->curframe, true);
+ return -EINVAL;
+ }
+ /* if the verifier is processing a loop, avoid adding new state
+ * too often, since different loop iterations have distinct
+ * states and may not help future pruning.
+ * This threshold shouldn't be too low to make sure that
+ * a loop with large bound will be rejected quickly.
+ * The most abusive loop will be:
+ * r1 += 1
+ * if r1 < 1000000 goto pc-2
+ * 1M insn_procssed limit / 100 == 10k peak states.
+ * This threshold shouldn't be too high either, since states
+ * at the end of the loop are likely to be useful in pruning.
+ */
+skip_inf_loop_check:
+ if (!force_new_state &&
+ env->jmps_processed - env->prev_jmps_processed < 20 &&
+ env->insn_processed - env->prev_insn_processed < 100)
+ add_new_state = false;
+ goto miss;
+ }
+ /* See comments for mark_all_regs_read_and_precise() */
+ loop = incomplete_read_marks(env, &sl->state);
+ if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) {
+hit:
+ sl->hit_cnt++;
+
+ /* if previous state reached the exit with precision and
+ * current state is equivalent to it (except precision marks)
+ * the precision needs to be propagated back in
+ * the current state.
+ */
+ err = 0;
+ if (bpf_is_jmp_point(env, env->insn_idx))
+ err = bpf_push_jmp_history(env, cur, 0, 0);
+ err = err ? : propagate_precision(env, &sl->state, cur, NULL);
+ if (err)
+ return err;
+ /* When processing iterator based loops above propagate_liveness and
+ * propagate_precision calls are not sufficient to transfer all relevant
+ * read and precision marks. E.g. consider the following case:
+ *
+ * .-> A --. Assume the states are visited in the order A, B, C.
+ * | | | Assume that state B reaches a state equivalent to state A.
+ * | v v At this point, state C is not processed yet, so state A
+ * '-- B C has not received any read or precision marks from C.
+ * Thus, marks propagated from A to B are incomplete.
+ *
+ * The verifier mitigates this by performing the following steps:
+ *
+ * - Prior to the main verification pass, strongly connected components
+ * (SCCs) are computed over the program's control flow graph,
+ * intraprocedurally.
+ *
+ * - During the main verification pass, `maybe_enter_scc()` checks
+ * whether the current verifier state is entering an SCC. If so, an
+ * instance of a `bpf_scc_visit` object is created, and the state
+ * entering the SCC is recorded as the entry state.
+ *
+ * - This instance is associated not with the SCC itself, but with a
+ * `bpf_scc_callchain`: a tuple consisting of the call sites leading to
+ * the SCC and the SCC id. See `compute_scc_callchain()`.
+ *
+ * - When a verification path encounters a `states_equal(...,
+ * RANGE_WITHIN)` condition, there exists a call chain describing the
+ * current state and a corresponding `bpf_scc_visit` instance. A copy
+ * of the current state is created and added to
+ * `bpf_scc_visit->backedges`.
+ *
+ * - When a verification path terminates, `maybe_exit_scc()` is called
+ * from `bpf_update_branch_counts()`. For states with `branches == 0`, it
+ * checks whether the state is the entry state of any `bpf_scc_visit`
+ * instance. If it is, this indicates that all paths originating from
+ * this SCC visit have been explored. `propagate_backedges()` is then
+ * called, which propagates read and precision marks through the
+ * backedges until a fixed point is reached.
+ * (In the earlier example, this would propagate marks from A to B,
+ * from C to A, and then again from A to B.)
+ *
+ * A note on callchains
+ * --------------------
+ *
+ * Consider the following example:
+ *
+ * void foo() { loop { ... SCC#1 ... } }
+ * void main() {
+ * A: foo();
+ * B: ...
+ * C: foo();
+ * }
+ *
+ * Here, there are two distinct callchains leading to SCC#1:
+ * - (A, SCC#1)
+ * - (C, SCC#1)
+ *
+ * Each callchain identifies a separate `bpf_scc_visit` instance that
+ * accumulates backedge states. The `propagate_{liveness,precision}()`
+ * functions traverse the parent state of each backedge state, which
+ * means these parent states must remain valid (i.e., not freed) while
+ * the corresponding `bpf_scc_visit` instance exists.
+ *
+ * Associating `bpf_scc_visit` instances directly with SCCs instead of
+ * callchains would break this invariant:
+ * - States explored during `C: foo()` would contribute backedges to
+ * SCC#1, but SCC#1 would only be exited once the exploration of
+ * `A: foo()` completes.
+ * - By that time, the states explored between `A: foo()` and `C: foo()`
+ * (i.e., `B: ...`) may have already been freed, causing the parent
+ * links for states from `C: foo()` to become invalid.
+ */
+ if (loop) {
+ struct bpf_scc_backedge *backedge;
+
+ backedge = kzalloc_obj(*backedge,
+ GFP_KERNEL_ACCOUNT);
+ if (!backedge)
+ return -ENOMEM;
+ err = bpf_copy_verifier_state(&backedge->state, cur);
+ backedge->state.equal_state = &sl->state;
+ backedge->state.insn_idx = insn_idx;
+ err = err ?: add_scc_backedge(env, &sl->state, backedge);
+ if (err) {
+ bpf_free_verifier_state(&backedge->state, false);
+ kfree(backedge);
+ return err;
+ }
+ }
+ return 1;
+ }
+miss:
+ /* when new state is not going to be added do not increase miss count.
+ * Otherwise several loop iterations will remove the state
+ * recorded earlier. The goal of these heuristics is to have
+ * states from some iterations of the loop (some in the beginning
+ * and some at the end) to help pruning.
+ */
+ if (add_new_state)
+ sl->miss_cnt++;
+ /* heuristic to determine whether this state is beneficial
+ * to keep checking from state equivalence point of view.
+ * Higher numbers increase max_states_per_insn and verification time,
+ * but do not meaningfully decrease insn_processed.
+ * 'n' controls how many times state could miss before eviction.
+ * Use bigger 'n' for checkpoints because evicting checkpoint states
+ * too early would hinder iterator convergence.
+ */
+ n = bpf_is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
+ if (sl->miss_cnt > sl->hit_cnt * n + n) {
+ /* the state is unlikely to be useful. Remove it to
+ * speed up verification
+ */
+ sl->in_free_list = true;
+ list_del(&sl->node);
+ list_add(&sl->node, &env->free_list);
+ env->free_list_size++;
+ env->explored_states_size--;
+ maybe_free_verifier_state(env, sl);
+ }
+ }
+
+ if (env->max_states_per_insn < states_cnt)
+ env->max_states_per_insn = states_cnt;
+
+ if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
+ return 0;
+
+ if (!add_new_state)
+ return 0;
+
+ /* There were no equivalent states, remember the current one.
+ * Technically the current state is not proven to be safe yet,
+ * but it will either reach outer most bpf_exit (which means it's safe)
+ * or it will be rejected. When there are no loops the verifier won't be
+ * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
+ * again on the way to bpf_exit.
+ * When looping the sl->state.branches will be > 0 and this state
+ * will not be considered for equivalence until branches == 0.
+ */
+ new_sl = kzalloc_obj(struct bpf_verifier_state_list, GFP_KERNEL_ACCOUNT);
+ if (!new_sl)
+ return -ENOMEM;
+ env->total_states++;
+ env->explored_states_size++;
+ update_peak_states(env);
+ env->prev_jmps_processed = env->jmps_processed;
+ env->prev_insn_processed = env->insn_processed;
+
+ /* forget precise markings we inherited, see __mark_chain_precision */
+ if (env->bpf_capable)
+ mark_all_scalars_imprecise(env, cur);
+
+ bpf_clear_singular_ids(env, cur);
+
+ /* add new state to the head of linked list */
+ new = &new_sl->state;
+ err = bpf_copy_verifier_state(new, cur);
+ if (err) {
+ bpf_free_verifier_state(new, false);
+ kfree(new_sl);
+ return err;
+ }
+ new->insn_idx = insn_idx;
+ verifier_bug_if(new->branches != 1, env,
+ "%s:branches_to_explore=%d insn %d",
+ __func__, new->branches, insn_idx);
+ err = maybe_enter_scc(env, new);
+ if (err) {
+ bpf_free_verifier_state(new, false);
+ kfree(new_sl);
+ return err;
+ }
+
+ cur->parent = new;
+ cur->first_insn_idx = insn_idx;
+ cur->dfs_depth = new->dfs_depth + 1;
+ bpf_clear_jmp_history(cur);
+ list_add(&new_sl->node, head);
+ return 0;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 700938782bed..b73b25c63073 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -941,14 +941,6 @@ static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
}
-static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
-{
- if (rcu_trace_implies_rcu_gp())
- bpf_map_free_rcu_gp(rcu);
- else
- call_rcu(rcu, bpf_map_free_rcu_gp);
-}
-
/* decrement map refcnt and schedule it for freeing via workqueue
* (underlying map implementation ops->map_free() might sleep)
*/
@@ -959,8 +951,9 @@ void bpf_map_put(struct bpf_map *map)
bpf_map_free_id(map);
WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
+ /* RCU tasks trace grace period implies RCU grace period. */
if (READ_ONCE(map->free_after_mult_rcu_gp))
- call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+ call_rcu_tasks_trace(&map->rcu, bpf_map_free_rcu_gp);
else if (READ_ONCE(map->free_after_rcu_gp))
call_rcu(&map->rcu, bpf_map_free_rcu_gp);
else
@@ -2832,7 +2825,7 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
sig = kvmemdup_bpfptr(usig, attr->signature_size);
if (IS_ERR(sig)) {
bpf_key_put(key);
- return -ENOMEM;
+ return PTR_ERR(sig);
}
bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0,
@@ -3273,14 +3266,6 @@ static bool bpf_link_is_tracepoint(struct bpf_link *link)
(link->type == BPF_LINK_TYPE_TRACING && link->attach_type == BPF_TRACE_RAW_TP);
}
-static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
-{
- if (rcu_trace_implies_rcu_gp())
- bpf_link_defer_dealloc_rcu_gp(rcu);
- else
- call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
-}
-
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
@@ -3306,7 +3291,8 @@ static void bpf_link_free(struct bpf_link *link)
* faultable case, since it exclusively uses RCU Tasks Trace.
*/
if (link->sleepable || (link->prog && link->prog->sleepable))
- call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+ /* RCU Tasks Trace grace period implies RCU grace period. */
+ call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
/* We need to do a SRCU grace period wait for non-faultable tracepoint BPF links. */
else if (bpf_link_is_tracepoint(link))
call_tracepoint_unregister_atomic(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
@@ -3754,6 +3740,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
tr = prog->aux->dst_trampoline;
tgt_prog = prog->aux->dst_prog;
}
+ /*
+ * It is to prevent modifying struct pt_regs via kprobe_write_ctx=true
+ * freplace prog. Without this check, kprobe_write_ctx=true freplace
+ * prog is allowed to attach to kprobe_write_ctx=false kprobe prog, and
+ * then modify the registers of the kprobe prog's target kernel
+ * function.
+ *
+ * This also blocks the combination of uprobe+freplace, because it is
+ * unable to recognize the use of the tgt_prog as an uprobe or a kprobe
+ * by tgt_prog itself. At attach time, uprobe/kprobe is recognized by
+ * the target perf event flags in __perf_event_set_bpf_prog().
+ */
+ if (prog->type == BPF_PROG_TYPE_EXT &&
+ prog->aux->kprobe_write_ctx != tgt_prog->aux->kprobe_write_ctx) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
err = bpf_link_prime(&link->link.link, &link_primer);
if (err)
@@ -6369,8 +6372,7 @@ static bool syscall_prog_is_valid_access(int off, int size,
{
if (off < 0 || off >= U16_MAX)
return false;
- if (off % size != 0)
- return false;
+ /* No alignment requirements for syscall ctx accesses. */
return true;
}
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 98d9b4c0daff..e791ae065c39 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -9,6 +9,8 @@
#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
#include <linux/mm_types.h>
+#include <linux/mmap_lock.h>
+#include <linux/sched/mm.h>
#include "mmap_unlock_work.h"
static const char * const iter_task_type_names[] = {
@@ -794,11 +796,20 @@ const struct bpf_func_proto bpf_find_vma_proto = {
.arg5_type = ARG_ANYTHING,
};
+static inline void bpf_iter_mmput_async(struct mm_struct *mm)
+{
+#ifdef CONFIG_MMU
+ mmput_async(mm);
+#else
+ mmput(mm);
+#endif
+}
+
struct bpf_iter_task_vma_kern_data {
struct task_struct *task;
struct mm_struct *mm;
- struct mmap_unlock_irq_work *work;
- struct vma_iterator vmi;
+ struct vm_area_struct snapshot;
+ u64 next_addr;
};
struct bpf_iter_task_vma {
@@ -819,12 +830,28 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
struct task_struct *task, u64 addr)
{
struct bpf_iter_task_vma_kern *kit = (void *)it;
- bool irq_work_busy = false;
int err;
BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+ if (!IS_ENABLED(CONFIG_PER_VMA_LOCK)) {
+ kit->data = NULL;
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Reject irqs-disabled contexts including NMI. Operations used
+ * by _next() and _destroy() (vma_end_read, fput, bpf_iter_mmput_async)
+ * can take spinlocks with IRQs disabled (pi_lock, pool->lock).
+ * Running from NMI or from a tracepoint that fires with those
+ * locks held could deadlock.
+ */
+ if (irqs_disabled()) {
+ kit->data = NULL;
+ return -EBUSY;
+ }
+
/* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
* before, so non-NULL kit->data doesn't point to previously
* bpf_mem_alloc'd bpf_iter_task_vma_kern_data
@@ -834,38 +861,131 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
return -ENOMEM;
kit->data->task = get_task_struct(task);
+ /*
+ * Safely read task->mm and acquire an mm reference.
+ *
+ * Cannot use get_task_mm() because its task_lock() is a
+ * blocking spin_lock that would deadlock if the target task
+ * already holds alloc_lock on this CPU (e.g. a softirq BPF
+ * program iterating a task interrupted while holding its
+ * alloc_lock).
+ */
+ if (!spin_trylock(&task->alloc_lock)) {
+ err = -EBUSY;
+ goto err_cleanup_iter;
+ }
kit->data->mm = task->mm;
+ if (kit->data->mm && !(task->flags & PF_KTHREAD))
+ mmget(kit->data->mm);
+ else
+ kit->data->mm = NULL;
+ spin_unlock(&task->alloc_lock);
if (!kit->data->mm) {
err = -ENOENT;
goto err_cleanup_iter;
}
- /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
- irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
- if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
- err = -EBUSY;
- goto err_cleanup_iter;
- }
-
- vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
+ kit->data->snapshot.vm_file = NULL;
+ kit->data->next_addr = addr;
return 0;
err_cleanup_iter:
- if (kit->data->task)
- put_task_struct(kit->data->task);
+ put_task_struct(kit->data->task);
bpf_mem_free(&bpf_global_ma, kit->data);
/* NULL kit->data signals failed bpf_iter_task_vma initialization */
kit->data = NULL;
return err;
}
+/*
+ * Find and lock the next VMA at or after data->next_addr.
+ *
+ * lock_vma_under_rcu() is a point lookup (mas_walk): it finds the VMA
+ * containing a given address but cannot iterate. An RCU-protected
+ * maple tree walk with vma_next() (mas_find) is needed first to locate
+ * the next VMA's vm_start across any gap.
+ *
+ * Between the RCU walk and the lock, the VMA may be removed, shrunk,
+ * or write-locked. On failure, advance past it using vm_end from the
+ * RCU walk. SLAB_TYPESAFE_BY_RCU can make vm_end stale, so fall back
+ * to PAGE_SIZE advancement to guarantee forward progress.
+ */
+static struct vm_area_struct *
+bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
+{
+ struct vm_area_struct *vma;
+ struct vma_iterator vmi;
+ unsigned long start, end;
+
+retry:
+ rcu_read_lock();
+ vma_iter_init(&vmi, data->mm, data->next_addr);
+ vma = vma_next(&vmi);
+ if (!vma) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ start = vma->vm_start;
+ end = vma->vm_end;
+ rcu_read_unlock();
+
+ vma = lock_vma_under_rcu(data->mm, start);
+ if (!vma) {
+ if (end <= data->next_addr)
+ data->next_addr += PAGE_SIZE;
+ else
+ data->next_addr = end;
+ goto retry;
+ }
+
+ if (unlikely(vma->vm_end <= data->next_addr)) {
+ data->next_addr += PAGE_SIZE;
+ vma_end_read(vma);
+ goto retry;
+ }
+
+ return vma;
+}
+
+static void bpf_iter_task_vma_snapshot_reset(struct vm_area_struct *snap)
+{
+ if (snap->vm_file) {
+ fput(snap->vm_file);
+ snap->vm_file = NULL;
+ }
+}
+
__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
{
struct bpf_iter_task_vma_kern *kit = (void *)it;
+ struct vm_area_struct *snap, *vma;
if (!kit->data) /* bpf_iter_task_vma_new failed */
return NULL;
- return vma_next(&kit->data->vmi);
+
+ snap = &kit->data->snapshot;
+
+ bpf_iter_task_vma_snapshot_reset(snap);
+
+ vma = bpf_iter_task_vma_find_next(kit->data);
+ if (!vma)
+ return NULL;
+
+ memcpy(snap, vma, sizeof(*snap));
+
+ /*
+ * The verifier only trusts vm_mm and vm_file (see
+ * BTF_TYPE_SAFE_TRUSTED_OR_NULL in verifier.c). Take a reference
+ * on vm_file; vm_mm is already correct because lock_vma_under_rcu()
+ * verifies vma->vm_mm == mm. All other pointers are untrusted by
+ * the verifier and left as-is.
+ */
+ if (snap->vm_file)
+ get_file(snap->vm_file);
+
+ kit->data->next_addr = vma->vm_end;
+ vma_end_read(vma);
+ return snap;
}
__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
@@ -873,8 +993,9 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
struct bpf_iter_task_vma_kern *kit = (void *)it;
if (kit->data) {
- bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+ bpf_iter_task_vma_snapshot_reset(&kit->data->snapshot);
put_task_struct(kit->data->task);
+ bpf_iter_mmput_async(kit->data->mm);
bpf_mem_free(&bpf_global_ma, kit->data);
}
}
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 4abc359b3db0..ec9c310cf5d7 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -286,8 +286,7 @@ struct tnum tnum_bswap64(struct tnum a)
*/
u64 tnum_step(struct tnum t, u64 z)
{
- u64 tmax, j, p, q, r, s, v, u, w, res;
- u8 k;
+ u64 tmax, d, carry_mask, filled, inc;
tmax = t.value | t.mask;
@@ -299,29 +298,22 @@ u64 tnum_step(struct tnum t, u64 z)
if (z < t.value)
return t.value;
- /* keep t's known bits, and match all unknown bits to z */
- j = t.value | (z & t.mask);
-
- if (j > z) {
- p = ~z & t.value & ~t.mask;
- k = fls64(p); /* k is the most-significant 0-to-1 flip */
- q = U64_MAX << k;
- r = q & z; /* positions > k matched to z */
- s = ~q & t.value; /* positions <= k matched to t.value */
- v = r | s;
- res = v;
- } else {
- p = z & ~t.value & ~t.mask;
- k = fls64(p); /* k is the most-significant 1-to-0 flip */
- q = U64_MAX << k;
- r = q & t.mask & z; /* unknown positions > k, matched to z */
- s = q & ~t.mask; /* known positions > k, set to 1 */
- v = r | s;
- /* add 1 to unknown positions > k to make value greater than z */
- u = v + (1ULL << k);
- /* extract bits in unknown positions > k from u, rest from t.value */
- w = (u & t.mask) | t.value;
- res = w;
- }
- return res;
+ /*
+ * Let r be the result tnum member, z = t.value + d.
+ * Every tnum member is t.value | s for some submask s of t.mask,
+ * and since t.value & t.mask == 0, t.value | s == t.value + s.
+ * So r > z becomes s > d where d = z - t.value.
+ *
+ * Find the smallest submask s of t.mask greater than d by
+ * "incrementing d within the mask": fill every non-mask
+ * position with 1 (`filled`) so +1 ripples through the gaps,
+ * then keep only mask bits. `carry_mask` additionally fills
+ * positions below the highest non-mask 1 in d, preventing
+ * it from trapping the carry.
+ */
+ d = z - t.value;
+ carry_mask = (1ULL << fls64(d & ~t.mask)) - 1;
+ filled = d | carry_mask | ~t.mask;
+ inc = (filled + 1) & t.mask;
+ return t.value | inc;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e3814152b52f..9e4980128151 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -195,9 +195,6 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
#define BPF_COMPLEXITY_LIMIT_STATES 64
-#define BPF_MAP_KEY_POISON (1ULL << 63)
-#define BPF_MAP_KEY_SEEN (1ULL << 62)
-
#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
#define BPF_PRIV_STACK_MIN_SIZE 64
@@ -210,16 +207,10 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
static int ref_set_non_owning(struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
static bool is_trusted_reg(const struct bpf_reg_state *reg);
-
-static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
-{
- return aux->map_ptr_state.poison;
-}
-
-static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
-{
- return aux->map_ptr_state.unpriv;
-}
+static inline bool in_sleepable_context(struct bpf_verifier_env *env);
+static const char *non_sleepable_context_description(struct bpf_verifier_env *env);
+static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg);
+static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg);
static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
struct bpf_map *map,
@@ -231,21 +222,6 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
aux->map_ptr_state.map_ptr = map;
}
-static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
-{
- return aux->map_key_state & BPF_MAP_KEY_POISON;
-}
-
-static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
-{
- return !(aux->map_key_state & BPF_MAP_KEY_SEEN);
-}
-
-static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
-{
- return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
-}
-
static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
{
bool poisoned = bpf_map_key_poisoned(aux);
@@ -254,29 +230,6 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
(poisoned ? BPF_MAP_KEY_POISON : 0ULL);
}
-static bool bpf_helper_call(const struct bpf_insn *insn)
-{
- return insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == 0;
-}
-
-static bool bpf_pseudo_call(const struct bpf_insn *insn)
-{
- return insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == BPF_PSEUDO_CALL;
-}
-
-static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
-{
- return insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
-}
-
-struct bpf_map_desc {
- struct bpf_map *ptr;
- int uid;
-};
-
struct bpf_call_arg_meta {
struct bpf_map_desc map;
bool raw_mode;
@@ -306,59 +259,6 @@ struct bpf_kfunc_meta {
s32 id;
};
-struct bpf_kfunc_call_arg_meta {
- /* In parameters */
- struct btf *btf;
- u32 func_id;
- u32 kfunc_flags;
- const struct btf_type *func_proto;
- const char *func_name;
- /* Out parameters */
- u32 ref_obj_id;
- u8 release_regno;
- bool r0_rdonly;
- u32 ret_btf_id;
- u64 r0_size;
- u32 subprogno;
- struct {
- u64 value;
- bool found;
- } arg_constant;
-
- /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
- * generally to pass info about user-defined local kptr types to later
- * verification logic
- * bpf_obj_drop/bpf_percpu_obj_drop
- * Record the local kptr type to be drop'd
- * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
- * Record the local kptr type to be refcount_incr'd and use
- * arg_owning_ref to determine whether refcount_acquire should be
- * fallible
- */
- struct btf *arg_btf;
- u32 arg_btf_id;
- bool arg_owning_ref;
- bool arg_prog;
-
- struct {
- struct btf_field *field;
- } arg_list_head;
- struct {
- struct btf_field *field;
- } arg_rbtree_root;
- struct {
- enum bpf_dynptr_type type;
- u32 id;
- u32 ref_obj_id;
- } initialized_dynptr;
- struct {
- u8 spi;
- u8 frameno;
- } iter;
- struct bpf_map_desc map;
- u64 mem_size;
-};
-
struct btf *btf_vmlinux;
static const char *btf_type_name(const struct btf *btf, u32 id)
@@ -437,13 +337,36 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
return rec;
}
-static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
+bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog)
{
struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
}
+static bool subprog_returns_void(struct bpf_verifier_env *env, int subprog)
+{
+ const struct btf_type *type, *func, *func_proto;
+ const struct btf *btf = env->prog->aux->btf;
+ u32 btf_id;
+
+ btf_id = env->prog->aux->func_info[subprog].type_id;
+
+ func = btf_type_by_id(btf, btf_id);
+ if (verifier_bug_if(!func, env, "btf_id %u not found", btf_id))
+ return false;
+
+ func_proto = btf_type_by_id(btf, func->type);
+ if (!func_proto)
+ return false;
+
+ type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ if (!type)
+ return false;
+
+ return btf_type_is_void(type);
+}
+
static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
{
struct bpf_func_info *info;
@@ -455,7 +378,7 @@ static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
return btf_type_name(env->prog->aux->btf, info->type_id);
}
-static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
+void bpf_mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
{
struct bpf_subprog_info *info = subprog_info(env, subprog);
@@ -543,13 +466,13 @@ static bool is_callback_calling_function(enum bpf_func_id func_id)
is_async_callback_calling_function(func_id);
}
-static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
+bool bpf_is_sync_callback_calling_insn(struct bpf_insn *insn)
{
return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
(bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
}
-static bool is_async_callback_calling_insn(struct bpf_insn *insn)
+bool bpf_is_async_callback_calling_insn(struct bpf_insn *insn)
{
return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) ||
(bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
@@ -570,24 +493,11 @@ static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn
return false;
}
-static bool is_may_goto_insn(struct bpf_insn *insn)
+bool bpf_is_may_goto_insn(struct bpf_insn *insn)
{
return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
}
-static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
-{
- return is_may_goto_insn(&env->prog->insnsi[insn_idx]);
-}
-
-static bool is_storage_get_function(enum bpf_func_id func_id)
-{
- return func_id == BPF_FUNC_sk_storage_get ||
- func_id == BPF_FUNC_inode_storage_get ||
- func_id == BPF_FUNC_task_storage_get ||
- func_id == BPF_FUNC_cgrp_storage_get;
-}
-
static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -603,39 +513,6 @@ static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
return ref_obj_uses > 1;
}
-static bool is_cmpxchg_insn(const struct bpf_insn *insn)
-{
- return BPF_CLASS(insn->code) == BPF_STX &&
- BPF_MODE(insn->code) == BPF_ATOMIC &&
- insn->imm == BPF_CMPXCHG;
-}
-
-static bool is_atomic_load_insn(const struct bpf_insn *insn)
-{
- return BPF_CLASS(insn->code) == BPF_STX &&
- BPF_MODE(insn->code) == BPF_ATOMIC &&
- insn->imm == BPF_LOAD_ACQ;
-}
-
-static bool is_atomic_fetch_insn(const struct bpf_insn *insn)
-{
- return BPF_CLASS(insn->code) == BPF_STX &&
- BPF_MODE(insn->code) == BPF_ATOMIC &&
- (insn->imm & BPF_FETCH);
-}
-
-static int __get_spi(s32 off)
-{
- return (-off - 1) / BPF_REG_SIZE;
-}
-
-static struct bpf_func_state *func(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg)
-{
- struct bpf_verifier_state *cur = env->cur_state;
-
- return cur->frame[reg->frameno];
-}
static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
{
@@ -661,19 +538,19 @@ static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_s
return -EINVAL;
}
- off = reg->off + reg->var_off.value;
+ off = reg->var_off.value;
if (off % BPF_REG_SIZE) {
verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
return -EINVAL;
}
- spi = __get_spi(off);
+ spi = bpf_get_spi(off);
if (spi + 1 < nr_slots) {
verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
return -EINVAL;
}
- if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots))
+ if (!is_spi_bounds_valid(bpf_func(env, reg), spi, nr_slots))
return -ERANGE;
return spi;
}
@@ -742,8 +619,6 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg,
enum bpf_dynptr_type type,
bool first_slot, int dynptr_id);
-static void __mark_reg_not_init(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg);
static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
struct bpf_reg_state *sreg1,
@@ -769,7 +644,7 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
enum bpf_dynptr_type type;
int spi, i, err;
@@ -821,8 +696,6 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
}
- bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
-
return 0;
}
@@ -835,15 +708,13 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat
state->stack[spi - 1].slot_type[i] = STACK_INVALID;
}
- __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
- __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
-
- bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
+ bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
}
static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi, ref_obj_id, i;
/*
@@ -902,7 +773,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
if (!env->allow_ptr_leaks)
- __mark_reg_not_init(env, reg);
+ bpf_mark_reg_not_init(env, reg);
else
__mark_reg_unknown(env, reg);
}
@@ -927,8 +798,27 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
spi = spi + 1;
if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
- verbose(env, "cannot overwrite referenced dynptr\n");
- return -EINVAL;
+ int ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
+ int ref_cnt = 0;
+
+ /*
+ * A referenced dynptr can be overwritten only if there is at
+ * least one other dynptr sharing the same ref_obj_id,
+ * ensuring the reference can still be properly released.
+ */
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_DYNPTR)
+ continue;
+ if (!state->stack[i].spilled_ptr.dynptr.first_slot)
+ continue;
+ if (state->stack[i].spilled_ptr.ref_obj_id == ref_obj_id)
+ ref_cnt++;
+ }
+
+ if (ref_cnt <= 1) {
+ verbose(env, "cannot overwrite referenced dynptr\n");
+ return -EINVAL;
+ }
}
mark_stack_slot_scratched(env, spi);
@@ -953,10 +843,8 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
/* Do not release reference state, we are destroying dynptr on stack,
* not using some helper to release it. Just reset register.
*/
- __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
- __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
-
- bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
+ bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
return 0;
}
@@ -991,7 +879,7 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_
static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int i, spi;
/* This already represents first slot of initialized bpf_dynptr.
@@ -1021,7 +909,7 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_re
static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
enum bpf_arg_type arg_type)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
enum bpf_dynptr_type dynptr_type;
int spi;
@@ -1051,7 +939,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, int insn_idx,
struct btf *btf, u32 btf_id, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi, i, j, id;
spi = iter_get_spi(env, reg, nr_slots);
@@ -1083,7 +971,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
for (j = 0; j < BPF_REG_SIZE; j++)
slot->slot_type[j] = STACK_ITER;
- bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
mark_stack_slot_scratched(env, spi - i);
}
@@ -1093,7 +980,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi, i, j;
spi = iter_get_spi(env, reg, nr_slots);
@@ -1107,12 +994,11 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
if (i == 0)
WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
- __mark_reg_not_init(env, st);
+ bpf_mark_reg_not_init(env, st);
for (j = 0; j < BPF_REG_SIZE; j++)
slot->slot_type[j] = STACK_INVALID;
- bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
mark_stack_slot_scratched(env, spi - i);
}
@@ -1122,7 +1008,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi, i, j;
/* For -ERANGE (i.e. spi not falling into allocated stack slots), we
@@ -1149,7 +1035,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
struct btf *btf, u32 btf_id, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi, i, j;
spi = iter_get_spi(env, reg, nr_slots);
@@ -1186,7 +1072,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, int insn_idx,
int kfunc_class)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
struct bpf_stack_state *slot;
struct bpf_reg_state *st;
int spi, i, id;
@@ -1202,7 +1088,6 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
slot = &state->stack[spi];
st = &slot->spilled_ptr;
- bpf_mark_stack_write(env, reg->frameno, BIT(spi));
__mark_reg_known_zero(st);
st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
st->ref_obj_id = id;
@@ -1218,7 +1103,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
int kfunc_class)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
struct bpf_stack_state *slot;
struct bpf_reg_state *st;
int spi, i, err;
@@ -1256,9 +1141,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
return err;
}
- __mark_reg_not_init(env, st);
-
- bpf_mark_stack_write(env, reg->frameno, BIT(spi));
+ bpf_mark_reg_not_init(env, st);
for (i = 0; i < BPF_REG_SIZE; i++)
slot->slot_type[i] = STACK_INVALID;
@@ -1269,7 +1152,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
struct bpf_stack_state *slot;
int spi, i;
@@ -1293,7 +1176,7 @@ static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bp
static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
struct bpf_stack_state *slot;
struct bpf_reg_state *st;
int spi, i;
@@ -1331,6 +1214,7 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack)
case STACK_IRQ_FLAG:
return true;
case STACK_INVALID:
+ case STACK_POISON:
case STACK_MISC:
case STACK_ZERO:
return false;
@@ -1343,26 +1227,12 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack)
/* The reg state of a pointer or a bounded scalar was saved when
* it was spilled to the stack.
*/
-static bool is_spilled_reg(const struct bpf_stack_state *stack)
-{
- return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
-}
-
-static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
-{
- return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL &&
- stack->spilled_ptr.type == SCALAR_VALUE;
-}
-
-static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack)
-{
- return stack->slot_type[0] == STACK_SPILL &&
- stack->spilled_ptr.type == SCALAR_VALUE;
-}
-/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
- * case they are equivalent, or it's STACK_ZERO, in which case we preserve
- * more precise STACK_ZERO.
+/*
+ * Mark stack slot as STACK_MISC, unless it is already:
+ * - STACK_INVALID, in which case they are equivalent.
+ * - STACK_ZERO, in which case we preserve more precise STACK_ZERO.
+ * - STACK_POISON, which truly forbids access to the slot.
* Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged
* mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is
* unnecessary as both are considered equivalent when loading data and pruning,
@@ -1373,14 +1243,14 @@ static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
{
if (*stype == STACK_ZERO)
return;
- if (*stype == STACK_INVALID)
+ if (*stype == STACK_INVALID || *stype == STACK_POISON)
return;
*stype = STACK_MISC;
}
static void scrub_spilled_slot(u8 *stype)
{
- if (*stype != STACK_INVALID)
+ if (*stype != STACK_INVALID && *stype != STACK_POISON)
*stype = STACK_MISC;
}
@@ -1669,14 +1539,6 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st
return NULL;
}
-static void update_peak_states(struct bpf_verifier_env *env)
-{
- u32 cur_states;
-
- cur_states = env->explored_states_size + env->free_list_size + env->num_backedges;
- env->peak_states = max(env->peak_states, cur_states);
-}
-
static void free_func_state(struct bpf_func_state *state)
{
if (!state)
@@ -1685,15 +1547,15 @@ static void free_func_state(struct bpf_func_state *state)
kfree(state);
}
-static void clear_jmp_history(struct bpf_verifier_state *state)
+void bpf_clear_jmp_history(struct bpf_verifier_state *state)
{
kfree(state->jmp_history);
state->jmp_history = NULL;
state->jmp_history_cnt = 0;
}
-static void free_verifier_state(struct bpf_verifier_state *state,
- bool free_self)
+void bpf_free_verifier_state(struct bpf_verifier_state *state,
+ bool free_self)
{
int i;
@@ -1702,42 +1564,11 @@ static void free_verifier_state(struct bpf_verifier_state *state,
state->frame[i] = NULL;
}
kfree(state->refs);
- clear_jmp_history(state);
+ bpf_clear_jmp_history(state);
if (free_self)
kfree(state);
}
-/* struct bpf_verifier_state->parent refers to states
- * that are in either of env->{expored_states,free_list}.
- * In both cases the state is contained in struct bpf_verifier_state_list.
- */
-static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st)
-{
- if (st->parent)
- return container_of(st->parent, struct bpf_verifier_state_list, state);
- return NULL;
-}
-
-static bool incomplete_read_marks(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st);
-
-/* A state can be freed if it is no longer referenced:
- * - is in the env->free_list;
- * - has no children states;
- */
-static void maybe_free_verifier_state(struct bpf_verifier_env *env,
- struct bpf_verifier_state_list *sl)
-{
- if (!sl->in_free_list
- || sl->state.branches != 0
- || incomplete_read_marks(env, &sl->state))
- return;
- list_del(&sl->node);
- free_verifier_state(&sl->state, false);
- kfree(sl);
- env->free_list_size--;
-}
-
/* copy verifier state from src to dst growing dst stack space
* when necessary to accommodate larger src stack
*/
@@ -1748,8 +1579,8 @@ static int copy_func_state(struct bpf_func_state *dst,
return copy_stack_state(dst, src);
}
-static int copy_verifier_state(struct bpf_verifier_state *dst_state,
- const struct bpf_verifier_state *src)
+int bpf_copy_verifier_state(struct bpf_verifier_state *dst_state,
+ const struct bpf_verifier_state *src)
{
struct bpf_func_state *dst;
int i, err;
@@ -1773,7 +1604,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return err;
dst_state->speculative = src->speculative;
dst_state->in_sleepable = src->in_sleepable;
- dst_state->cleaned = src->cleaned;
dst_state->curframe = src->curframe;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
@@ -1803,7 +1633,7 @@ static u32 state_htab_size(struct bpf_verifier_env *env)
return env->prog->len;
}
-static struct list_head *explored_state(struct bpf_verifier_env *env, int idx)
+struct list_head *bpf_explored_state(struct bpf_verifier_env *env, int idx)
{
struct bpf_verifier_state *cur = env->cur_state;
struct bpf_func_state *state = cur->frame[cur->curframe];
@@ -1825,266 +1655,19 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
return true;
}
-/* Return IP for a given frame in a call stack */
-static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame)
-{
- return frame == st->curframe
- ? st->insn_idx
- : st->frame[frame + 1]->callsite;
-}
-
-/* For state @st look for a topmost frame with frame_insn_idx() in some SCC,
- * if such frame exists form a corresponding @callchain as an array of
- * call sites leading to this frame and SCC id.
- * E.g.:
- *
- * void foo() { A: loop {... SCC#1 ...}; }
- * void bar() { B: loop { C: foo(); ... SCC#2 ... }
- * D: loop { E: foo(); ... SCC#3 ... } }
- * void main() { F: bar(); }
- *
- * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending
- * on @st frame call sites being (F,C,A) or (F,E,A).
- */
-static bool compute_scc_callchain(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st,
- struct bpf_scc_callchain *callchain)
-{
- u32 i, scc, insn_idx;
-
- memset(callchain, 0, sizeof(*callchain));
- for (i = 0; i <= st->curframe; i++) {
- insn_idx = frame_insn_idx(st, i);
- scc = env->insn_aux_data[insn_idx].scc;
- if (scc) {
- callchain->scc = scc;
- break;
- } else if (i < st->curframe) {
- callchain->callsites[i] = insn_idx;
- } else {
- return false;
- }
- }
- return true;
-}
-
-/* Check if bpf_scc_visit instance for @callchain exists. */
-static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env,
- struct bpf_scc_callchain *callchain)
-{
- struct bpf_scc_info *info = env->scc_info[callchain->scc];
- struct bpf_scc_visit *visits = info->visits;
- u32 i;
- if (!info)
- return NULL;
- for (i = 0; i < info->num_visits; i++)
- if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0)
- return &visits[i];
- return NULL;
-}
-
-/* Allocate a new bpf_scc_visit instance corresponding to @callchain.
- * Allocated instances are alive for a duration of the do_check_common()
- * call and are freed by free_states().
- */
-static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env,
- struct bpf_scc_callchain *callchain)
-{
- struct bpf_scc_visit *visit;
- struct bpf_scc_info *info;
- u32 scc, num_visits;
- u64 new_sz;
-
- scc = callchain->scc;
- info = env->scc_info[scc];
- num_visits = info ? info->num_visits : 0;
- new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1);
- info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT);
- if (!info)
- return NULL;
- env->scc_info[scc] = info;
- info->num_visits = num_visits + 1;
- visit = &info->visits[num_visits];
- memset(visit, 0, sizeof(*visit));
- memcpy(&visit->callchain, callchain, sizeof(*callchain));
- return visit;
-}
-
-/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */
-static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain)
-{
- char *buf = env->tmp_str_buf;
- int i, delta = 0;
-
- delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "(");
- for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) {
- if (!callchain->callsites[i])
- break;
- delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,",
- callchain->callsites[i]);
- }
- delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc);
- return env->tmp_str_buf;
-}
-
-/* If callchain for @st exists (@st is in some SCC), ensure that
- * bpf_scc_visit instance for this callchain exists.
- * If instance does not exist or is empty, assign visit->entry_state to @st.
- */
-static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
-{
- struct bpf_scc_callchain *callchain = &env->callchain_buf;
- struct bpf_scc_visit *visit;
-
- if (!compute_scc_callchain(env, st, callchain))
- return 0;
- visit = scc_visit_lookup(env, callchain);
- visit = visit ?: scc_visit_alloc(env, callchain);
- if (!visit)
- return -ENOMEM;
- if (!visit->entry_state) {
- visit->entry_state = st;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "SCC enter %s\n", format_callchain(env, callchain));
- }
- return 0;
-}
-
-static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit);
-
-/* If callchain for @st exists (@st is in some SCC), make it empty:
- * - set visit->entry_state to NULL;
- * - flush accumulated backedges.
- */
-static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
-{
- struct bpf_scc_callchain *callchain = &env->callchain_buf;
- struct bpf_scc_visit *visit;
-
- if (!compute_scc_callchain(env, st, callchain))
- return 0;
- visit = scc_visit_lookup(env, callchain);
- if (!visit) {
- /*
- * If path traversal stops inside an SCC, corresponding bpf_scc_visit
- * must exist for non-speculative paths. For non-speculative paths
- * traversal stops when:
- * a. Verification error is found, maybe_exit_scc() is not called.
- * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member
- * of any SCC.
- * c. A checkpoint is reached and matched. Checkpoints are created by
- * is_state_visited(), which calls maybe_enter_scc(), which allocates
- * bpf_scc_visit instances for checkpoints within SCCs.
- * (c) is the only case that can reach this point.
- */
- if (!st->speculative) {
- verifier_bug(env, "scc exit: no visit info for call chain %s",
- format_callchain(env, callchain));
- return -EFAULT;
- }
- return 0;
- }
- if (visit->entry_state != st)
- return 0;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "SCC exit %s\n", format_callchain(env, callchain));
- visit->entry_state = NULL;
- env->num_backedges -= visit->num_backedges;
- visit->num_backedges = 0;
- update_peak_states(env);
- return propagate_backedges(env, visit);
-}
-
-/* Lookup an bpf_scc_visit instance corresponding to @st callchain
- * and add @backedge to visit->backedges. @st callchain must exist.
- */
-static int add_scc_backedge(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st,
- struct bpf_scc_backedge *backedge)
-{
- struct bpf_scc_callchain *callchain = &env->callchain_buf;
- struct bpf_scc_visit *visit;
-
- if (!compute_scc_callchain(env, st, callchain)) {
- verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d",
- st->insn_idx);
- return -EFAULT;
- }
- visit = scc_visit_lookup(env, callchain);
- if (!visit) {
- verifier_bug(env, "add backedge: no visit info for call chain %s",
- format_callchain(env, callchain));
- return -EFAULT;
- }
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "SCC backedge %s\n", format_callchain(env, callchain));
- backedge->next = visit->backedges;
- visit->backedges = backedge;
- visit->num_backedges++;
- env->num_backedges++;
- update_peak_states(env);
- return 0;
-}
-
-/* bpf_reg_state->live marks for registers in a state @st are incomplete,
- * if state @st is in some SCC and not all execution paths starting at this
- * SCC are fully explored.
- */
-static bool incomplete_read_marks(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st)
-{
- struct bpf_scc_callchain *callchain = &env->callchain_buf;
- struct bpf_scc_visit *visit;
-
- if (!compute_scc_callchain(env, st, callchain))
- return false;
- visit = scc_visit_lookup(env, callchain);
- if (!visit)
- return false;
- return !!visit->backedges;
-}
-
-static void free_backedges(struct bpf_scc_visit *visit)
+void bpf_free_backedges(struct bpf_scc_visit *visit)
{
struct bpf_scc_backedge *backedge, *next;
for (backedge = visit->backedges; backedge; backedge = next) {
- free_verifier_state(&backedge->state, false);
+ bpf_free_verifier_state(&backedge->state, false);
next = backedge->next;
kfree(backedge);
}
visit->backedges = NULL;
}
-static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
-{
- struct bpf_verifier_state_list *sl = NULL, *parent_sl;
- struct bpf_verifier_state *parent;
- int err;
-
- while (st) {
- u32 br = --st->branches;
-
- /* verifier_bug_if(br > 1, ...) technically makes sense here,
- * but see comment in push_stack(), hence:
- */
- verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br);
- if (br)
- break;
- err = maybe_exit_scc(env, st);
- if (err)
- return err;
- parent = st->parent;
- parent_sl = state_parent_as_list(st);
- if (sl)
- maybe_free_verifier_state(env, sl);
- st = parent;
- sl = parent_sl;
- }
- return 0;
-}
-
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
int *insn_idx, bool pop_log)
{
@@ -2096,7 +1679,7 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
return -ENOENT;
if (cur) {
- err = copy_verifier_state(cur, &head->st);
+ err = bpf_copy_verifier_state(cur, &head->st);
if (err)
return err;
}
@@ -2107,7 +1690,7 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
if (prev_insn_idx)
*prev_insn_idx = head->prev_insn_idx;
elem = head->next;
- free_verifier_state(&head->st, false);
+ bpf_free_verifier_state(&head->st, false);
kfree(head);
env->head = elem;
env->stack_size--;
@@ -2144,7 +1727,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
- err = copy_verifier_state(&elem->st, cur);
+ err = bpf_copy_verifier_state(&elem->st, cur);
if (err)
return ERR_PTR(-ENOMEM);
elem->st.speculative |= speculative;
@@ -2168,7 +1751,6 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
return &elem->st;
}
-#define CALLER_SAVED_REGS 6
static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
@@ -2231,13 +1813,6 @@ static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf
static void mark_reg_known_zero(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno)
{
- if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
- /* Something bad happened, let's kill all regs */
- for (regno = 0; regno < MAX_BPF_REG; regno++)
- __mark_reg_not_init(env, regs + regno);
- return;
- }
__mark_reg_known_zero(regs + regno);
}
@@ -2288,11 +1863,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno,
struct btf_field_graph_root *ds_head)
{
- __mark_reg_known_zero(&regs[regno]);
+ __mark_reg_known(&regs[regno], ds_head->node_offset);
regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC;
regs[regno].btf = ds_head->btf;
regs[regno].btf_id = ds_head->value_btf_id;
- regs[regno].off = ds_head->node_offset;
}
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
@@ -2323,7 +1897,6 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
*/
return reg->type == which &&
reg->id == 0 &&
- reg->off == 0 &&
tnum_equals_const(reg->var_off, 0);
}
@@ -2434,7 +2007,7 @@ static void __update_reg_bounds(struct bpf_reg_state *reg)
}
/* Uses signed min/max values to inform unsigned, and vice-versa */
-static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
+static void deduce_bounds_32_from_64(struct bpf_reg_state *reg)
{
/* If upper 32 bits of u64/s64 range don't change, we can use lower 32
* bits to improve our u32/s32 boundaries.
@@ -2504,6 +2077,10 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
}
+}
+
+static void deduce_bounds_32_from_32(struct bpf_reg_state *reg)
+{
/* if u32 range forms a valid s32 range (due to matching sign bit),
* try to learn from that
*/
@@ -2545,7 +2122,7 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
}
}
-static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
+static void deduce_bounds_64_from_64(struct bpf_reg_state *reg)
{
/* If u64 range forms a valid s64 range (due to matching sign bit),
* try to learn from that. Let's do a bit of ASCII art to see when
@@ -2680,7 +2257,7 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
}
}
-static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
+static void deduce_bounds_64_from_32(struct bpf_reg_state *reg)
{
/* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
* values on both sides of 64-bit range in hope to have tighter range.
@@ -2749,9 +2326,10 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
static void __reg_deduce_bounds(struct bpf_reg_state *reg)
{
- __reg32_deduce_bounds(reg);
- __reg64_deduce_bounds(reg);
- __reg_deduce_mixed_bounds(reg);
+ deduce_bounds_64_from_64(reg);
+ deduce_bounds_32_from_64(reg);
+ deduce_bounds_32_from_32(reg);
+ deduce_bounds_64_from_32(reg);
}
/* Attempts to improve var_off based on unsigned min/max information */
@@ -2767,14 +2345,18 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
+static bool range_bounds_violation(struct bpf_reg_state *reg);
+
static void reg_bounds_sync(struct bpf_reg_state *reg)
{
+ /* If the input reg_state is invalid, we can exit early */
+ if (range_bounds_violation(reg))
+ return;
/* We might have learned new bounds from the var_off. */
__update_reg_bounds(reg);
/* We might have learned something about the sign bit. */
__reg_deduce_bounds(reg);
__reg_deduce_bounds(reg);
- __reg_deduce_bounds(reg);
/* We might have learned some bits from the bounds. */
__reg_bound_offset(reg);
/* Intersecting with the old var_off might have improved our bounds
@@ -2784,39 +2366,55 @@ static void reg_bounds_sync(struct bpf_reg_state *reg)
__update_reg_bounds(reg);
}
+static bool range_bounds_violation(struct bpf_reg_state *reg)
+{
+ return (reg->umin_value > reg->umax_value || reg->smin_value > reg->smax_value ||
+ reg->u32_min_value > reg->u32_max_value ||
+ reg->s32_min_value > reg->s32_max_value);
+}
+
+static bool const_tnum_range_mismatch(struct bpf_reg_state *reg)
+{
+ u64 uval = reg->var_off.value;
+ s64 sval = (s64)uval;
+
+ if (!tnum_is_const(reg->var_off))
+ return false;
+
+ return reg->umin_value != uval || reg->umax_value != uval ||
+ reg->smin_value != sval || reg->smax_value != sval;
+}
+
+static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg)
+{
+ u32 uval32 = tnum_subreg(reg->var_off).value;
+ s32 sval32 = (s32)uval32;
+
+ if (!tnum_subreg_is_const(reg->var_off))
+ return false;
+
+ return reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
+ reg->s32_min_value != sval32 || reg->s32_max_value != sval32;
+}
+
static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, const char *ctx)
{
const char *msg;
- if (reg->umin_value > reg->umax_value ||
- reg->smin_value > reg->smax_value ||
- reg->u32_min_value > reg->u32_max_value ||
- reg->s32_min_value > reg->s32_max_value) {
- msg = "range bounds violation";
- goto out;
+ if (range_bounds_violation(reg)) {
+ msg = "range bounds violation";
+ goto out;
}
- if (tnum_is_const(reg->var_off)) {
- u64 uval = reg->var_off.value;
- s64 sval = (s64)uval;
-
- if (reg->umin_value != uval || reg->umax_value != uval ||
- reg->smin_value != sval || reg->smax_value != sval) {
- msg = "const tnum out of sync with range bounds";
- goto out;
- }
+ if (const_tnum_range_mismatch(reg)) {
+ msg = "const tnum out of sync with range bounds";
+ goto out;
}
- if (tnum_subreg_is_const(reg->var_off)) {
- u32 uval32 = tnum_subreg(reg->var_off).value;
- s32 sval32 = (s32)uval32;
-
- if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
- reg->s32_min_value != sval32 || reg->s32_max_value != sval32) {
- msg = "const subreg tnum out of sync with range bounds";
- goto out;
- }
+ if (const_tnum_range_mismatch_32(reg)) {
+ msg = "const subreg tnum out of sync with range bounds";
+ goto out;
}
return 0;
@@ -2859,7 +2457,7 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
}
/* Mark a register as having a completely unknown (scalar) value. */
-static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
+void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
{
/*
* Clear type, off, and union(map_ptr, range) and
@@ -2881,20 +2479,13 @@ static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
static void __mark_reg_unknown(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
- __mark_reg_unknown_imprecise(reg);
+ bpf_mark_reg_unknown_imprecise(reg);
reg->precise = !env->bpf_capable;
}
static void mark_reg_unknown(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno)
{
- if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
- /* Something bad happened, let's kill all regs except FP */
- for (regno = 0; regno < BPF_REG_FP; regno++)
- __mark_reg_not_init(env, regs + regno);
- return;
- }
__mark_reg_unknown(env, regs + regno);
}
@@ -2917,26 +2508,13 @@ static int __mark_reg_s32_range(struct bpf_verifier_env *env,
return reg_bounds_sanity_check(env, reg, "s32_range");
}
-static void __mark_reg_not_init(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg)
+void bpf_mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
{
__mark_reg_unknown(env, reg);
reg->type = NOT_INIT;
}
-static void mark_reg_not_init(struct bpf_verifier_env *env,
- struct bpf_reg_state *regs, u32 regno)
-{
- if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
- /* Something bad happened, let's kill all regs except FP */
- for (regno = 0; regno < BPF_REG_FP; regno++)
- __mark_reg_not_init(env, regs + regno);
- return;
- }
- __mark_reg_not_init(env, regs + regno);
-}
-
static int mark_btf_ld_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno,
enum bpf_reg_type reg_type,
@@ -2974,7 +2552,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- mark_reg_not_init(env, regs, i);
+ bpf_mark_reg_not_init(env, &regs[i]);
regs[i].subreg_def = DEF_NOT_SUBREG;
}
@@ -2986,10 +2564,13 @@ static void init_reg_state(struct bpf_verifier_env *env,
static struct bpf_retval_range retval_range(s32 minval, s32 maxval)
{
- return (struct bpf_retval_range){ minval, maxval };
+ /*
+ * return_32bit is set to false by default and set explicitly
+ * by the caller when necessary.
+ */
+ return (struct bpf_retval_range){ minval, maxval, false };
}
-#define BPF_MAIN_FUNC (-1)
static void init_func_state(struct bpf_verifier_env *env,
struct bpf_func_state *state,
int callsite, int frameno, int subprogno)
@@ -3026,7 +2607,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
env->stack_size);
return ERR_PTR(-E2BIG);
}
- /* Unlike push_stack() do not copy_verifier_state().
+ /* Unlike push_stack() do not bpf_copy_verifier_state().
* The caller state doesn't matter.
* This is async callback. It starts in a fresh stack.
* Initialize it similar to do_check_common().
@@ -3045,12 +2626,6 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
}
-enum reg_arg_type {
- SRC_OP, /* register is used as source operand */
- DST_OP, /* register is used as destination operand */
- DST_OP_NO_MARK /* same as above, check only, don't mark */
-};
-
static int cmp_subprogs(const void *a, const void *b)
{
return ((struct bpf_subprog_info *)a)->start -
@@ -3079,7 +2654,7 @@ struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *en
}
/* Find subprogram that starts exactly at 'off' */
-static int find_subprog(struct bpf_verifier_env *env, int off)
+int bpf_find_subprog(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *p;
@@ -3098,7 +2673,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
verbose(env, "call to invalid destination\n");
return -EINVAL;
}
- ret = find_subprog(env, off);
+ ret = bpf_find_subprog(env, off);
if (ret >= 0)
return ret;
if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
@@ -3174,41 +2749,19 @@ static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
return ret;
}
-#define MAX_KFUNC_DESCS 256
#define MAX_KFUNC_BTFS 256
-struct bpf_kfunc_desc {
- struct btf_func_model func_model;
- u32 func_id;
- s32 imm;
- u16 offset;
- unsigned long addr;
-};
-
struct bpf_kfunc_btf {
struct btf *btf;
struct module *module;
u16 offset;
};
-struct bpf_kfunc_desc_tab {
- /* Sorted by func_id (BTF ID) and offset (fd_array offset) during
- * verification. JITs do lookups by bpf_insn, where func_id may not be
- * available, therefore at the end of verification do_misc_fixups()
- * sorts this by imm and offset.
- */
- struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
- u32 nr_descs;
-};
-
struct bpf_kfunc_btf_tab {
struct bpf_kfunc_btf descs[MAX_KFUNC_BTFS];
u32 nr_descs;
};
-static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc,
- int insn_idx);
-
static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
{
const struct bpf_kfunc_desc *d0 = a;
@@ -3436,7 +2989,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
return 0;
}
-static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
+int bpf_add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, u16 offset)
{
struct bpf_kfunc_btf_tab *btf_tab;
struct btf_func_model func_model;
@@ -3531,95 +3084,11 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return 0;
}
-static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
-{
- const struct bpf_kfunc_desc *d0 = a;
- const struct bpf_kfunc_desc *d1 = b;
-
- if (d0->imm != d1->imm)
- return d0->imm < d1->imm ? -1 : 1;
- if (d0->offset != d1->offset)
- return d0->offset < d1->offset ? -1 : 1;
- return 0;
-}
-
-static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc)
-{
- unsigned long call_imm;
-
- if (bpf_jit_supports_far_kfunc_call()) {
- call_imm = desc->func_id;
- } else {
- call_imm = BPF_CALL_IMM(desc->addr);
- /* Check whether the relative offset overflows desc->imm */
- if ((unsigned long)(s32)call_imm != call_imm) {
- verbose(env, "address of kernel func_id %u is out of range\n",
- desc->func_id);
- return -EINVAL;
- }
- }
- desc->imm = call_imm;
- return 0;
-}
-
-static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env)
-{
- struct bpf_kfunc_desc_tab *tab;
- int i, err;
-
- tab = env->prog->aux->kfunc_tab;
- if (!tab)
- return 0;
-
- for (i = 0; i < tab->nr_descs; i++) {
- err = set_kfunc_desc_imm(env, &tab->descs[i]);
- if (err)
- return err;
- }
-
- sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
- kfunc_desc_cmp_by_imm_off, NULL);
- return 0;
-}
-
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
{
return !!prog->aux->kfunc_tab;
}
-const struct btf_func_model *
-bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
- const struct bpf_insn *insn)
-{
- const struct bpf_kfunc_desc desc = {
- .imm = insn->imm,
- .offset = insn->off,
- };
- const struct bpf_kfunc_desc *res;
- struct bpf_kfunc_desc_tab *tab;
-
- tab = prog->aux->kfunc_tab;
- res = bsearch(&desc, tab->descs, tab->nr_descs,
- sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
-
- return res ? &res->func_model : NULL;
-}
-
-static int add_kfunc_in_insns(struct bpf_verifier_env *env,
- struct bpf_insn *insn, int cnt)
-{
- int i, ret;
-
- for (i = 0; i < cnt; i++, insn++) {
- if (bpf_pseudo_kfunc_call(insn)) {
- ret = add_kfunc_call(env, insn->imm, insn->off);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *subprog = env->subprog_info;
@@ -3644,7 +3113,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn))
ret = add_subprog(env, i + insn->imm + 1);
else
- ret = add_kfunc_call(env, insn->imm, insn->off);
+ ret = bpf_add_kfunc_call(env, insn->imm, insn->off);
if (ret < 0)
return ret;
@@ -3666,7 +3135,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
if (env->subprog_info[i].start != ex_cb_insn)
continue;
env->exception_callback_subprog = i;
- mark_subprog_exc_cb(env, i);
+ bpf_mark_subprog_exc_cb(env, i);
break;
}
}
@@ -3739,17 +3208,101 @@ next:
return 0;
}
+/*
+ * Sort subprogs in topological order so that leaf subprogs come first and
+ * their callers come later. This is a DFS post-order traversal of the call
+ * graph. Scan only reachable instructions (those in the computed postorder) of
+ * the current subprog to discover callees (direct subprogs and sync
+ * callbacks).
+ */
+static int sort_subprogs_topo(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *si = env->subprog_info;
+ int *insn_postorder = env->cfg.insn_postorder;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int cnt = env->subprog_cnt;
+ int *dfs_stack = NULL;
+ int top = 0, order = 0;
+ int i, ret = 0;
+ u8 *color = NULL;
+
+ color = kvzalloc_objs(*color, cnt, GFP_KERNEL_ACCOUNT);
+ dfs_stack = kvmalloc_objs(*dfs_stack, cnt, GFP_KERNEL_ACCOUNT);
+ if (!color || !dfs_stack) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * DFS post-order traversal.
+ * Color values: 0 = unvisited, 1 = on stack, 2 = done.
+ */
+ for (i = 0; i < cnt; i++) {
+ if (color[i])
+ continue;
+ color[i] = 1;
+ dfs_stack[top++] = i;
+
+ while (top > 0) {
+ int cur = dfs_stack[top - 1];
+ int po_start = si[cur].postorder_start;
+ int po_end = si[cur + 1].postorder_start;
+ bool pushed = false;
+ int j;
+
+ for (j = po_start; j < po_end; j++) {
+ int idx = insn_postorder[j];
+ int callee;
+
+ if (!bpf_pseudo_call(&insn[idx]) && !bpf_pseudo_func(&insn[idx]))
+ continue;
+ callee = bpf_find_subprog(env, idx + insn[idx].imm + 1);
+ if (callee < 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (color[callee] == 2)
+ continue;
+ if (color[callee] == 1) {
+ if (bpf_pseudo_func(&insn[idx]))
+ continue;
+ verbose(env, "recursive call from %s() to %s()\n",
+ subprog_name(env, cur),
+ subprog_name(env, callee));
+ ret = -EINVAL;
+ goto out;
+ }
+ color[callee] = 1;
+ dfs_stack[top++] = callee;
+ pushed = true;
+ break;
+ }
+
+ if (!pushed) {
+ color[cur] = 2;
+ env->subprog_topo_order[order++] = cur;
+ top--;
+ }
+ }
+ }
+
+ if (env->log.level & BPF_LOG_LEVEL2)
+ for (i = 0; i < cnt; i++)
+ verbose(env, "topo_order[%d] = %s\n",
+ i, subprog_name(env, env->subprog_topo_order[i]));
+out:
+ kvfree(dfs_stack);
+ kvfree(color);
+ return ret;
+}
+
static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
int spi, int nr_slots)
{
- int err, i;
+ int i;
- for (i = 0; i < nr_slots; i++) {
- err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i));
- if (err)
- return err;
+ for (i = 0; i < nr_slots; i++)
mark_stack_slot_scratched(env, spi - i);
- }
return 0;
}
@@ -3793,8 +3346,8 @@ static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
*/
-static bool is_reg64(struct bpf_insn *insn,
- u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
+bool bpf_is_reg64(struct bpf_insn *insn,
+ u32 regno, struct bpf_reg_state *reg, enum bpf_reg_arg_type t)
{
u8 code, class, op;
@@ -3879,41 +3432,6 @@ static bool is_reg64(struct bpf_insn *insn,
return true;
}
-/* Return the regno defined by the insn, or -1. */
-static int insn_def_regno(const struct bpf_insn *insn)
-{
- switch (BPF_CLASS(insn->code)) {
- case BPF_JMP:
- case BPF_JMP32:
- case BPF_ST:
- return -1;
- case BPF_STX:
- if (BPF_MODE(insn->code) == BPF_ATOMIC ||
- BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) {
- if (insn->imm == BPF_CMPXCHG)
- return BPF_REG_0;
- else if (insn->imm == BPF_LOAD_ACQ)
- return insn->dst_reg;
- else if (insn->imm & BPF_FETCH)
- return insn->src_reg;
- }
- return -1;
- default:
- return insn->dst_reg;
- }
-}
-
-/* Return TRUE if INSN has defined any 32-bit value explicitly. */
-static bool insn_has_def32(struct bpf_insn *insn)
-{
- int dst_reg = insn_def_regno(insn);
-
- if (dst_reg == -1)
- return false;
-
- return !is_reg64(insn, dst_reg, NULL, DST_OP);
-}
-
static void mark_insn_zext(struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
@@ -3928,21 +3446,16 @@ static void mark_insn_zext(struct bpf_verifier_env *env,
}
static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
- enum reg_arg_type t)
+ enum bpf_reg_arg_type t)
{
struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
struct bpf_reg_state *reg;
bool rw64;
- if (regno >= MAX_BPF_REG) {
- verbose(env, "R%d is invalid\n", regno);
- return -EINVAL;
- }
-
mark_reg_scratched(env, regno);
reg = &regs[regno];
- rw64 = is_reg64(insn, regno, reg, t);
+ rw64 = bpf_is_reg64(insn, regno, reg, t);
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (reg->type == NOT_INIT) {
@@ -3971,7 +3484,7 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r
}
static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
- enum reg_arg_type t)
+ enum bpf_reg_arg_type t)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
@@ -3984,26 +3497,6 @@ static int insn_stack_access_flags(int frameno, int spi)
return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
}
-static int insn_stack_access_spi(int insn_flags)
-{
- return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
-}
-
-static int insn_stack_access_frameno(int insn_flags)
-{
- return insn_flags & INSN_F_FRAMENO_MASK;
-}
-
-static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
-{
- env->insn_aux_data[idx].jmp_point = true;
-}
-
-static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
-{
- return env->insn_aux_data[insn_idx].jmp_point;
-}
-
#define LR_FRAMENO_BITS 3
#define LR_SPI_BITS 6
#define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1)
@@ -4082,91 +3575,6 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s)
}
}
-/* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
- int insn_flags, u64 linked_regs)
-{
- u32 cnt = cur->jmp_history_cnt;
- struct bpf_jmp_history_entry *p;
- size_t alloc_size;
-
- /* combine instruction flags if we already recorded this instruction */
- if (env->cur_hist_ent) {
- /* atomic instructions push insn_flags twice, for READ and
- * WRITE sides, but they should agree on stack slot
- */
- verifier_bug_if((env->cur_hist_ent->flags & insn_flags) &&
- (env->cur_hist_ent->flags & insn_flags) != insn_flags,
- env, "insn history: insn_idx %d cur flags %x new flags %x",
- env->insn_idx, env->cur_hist_ent->flags, insn_flags);
- env->cur_hist_ent->flags |= insn_flags;
- verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
- "insn history: insn_idx %d linked_regs: %#llx",
- env->insn_idx, env->cur_hist_ent->linked_regs);
- env->cur_hist_ent->linked_regs = linked_regs;
- return 0;
- }
-
- cnt++;
- alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
- p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT);
- if (!p)
- return -ENOMEM;
- cur->jmp_history = p;
-
- p = &cur->jmp_history[cnt - 1];
- p->idx = env->insn_idx;
- p->prev_idx = env->prev_insn_idx;
- p->flags = insn_flags;
- p->linked_regs = linked_regs;
- cur->jmp_history_cnt = cnt;
- env->cur_hist_ent = p;
-
- return 0;
-}
-
-static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
- u32 hist_end, int insn_idx)
-{
- if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
- return &st->jmp_history[hist_end - 1];
- return NULL;
-}
-
-/* Backtrack one insn at a time. If idx is not at the top of recorded
- * history then previous instruction came from straight line execution.
- * Return -ENOENT if we exhausted all instructions within given state.
- *
- * It's legal to have a bit of a looping with the same starting and ending
- * insn index within the same state, e.g.: 3->4->5->3, so just because current
- * instruction index is the same as state's first_idx doesn't mean we are
- * done. If there is still some jump history left, we should keep going. We
- * need to take into account that we might have a jump history between given
- * state's parent and itself, due to checkpointing. In this case, we'll have
- * history entry recording a jump from last instruction of parent state and
- * first instruction of given state.
- */
-static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
- u32 *history)
-{
- u32 cnt = *history;
-
- if (i == st->first_insn_idx) {
- if (cnt == 0)
- return -ENOENT;
- if (cnt == 1 && st->jmp_history[0].idx == i)
- return -ENOENT;
- }
-
- if (cnt && st->jmp_history[cnt - 1].idx == i) {
- i = st->jmp_history[cnt - 1].prev_idx;
- (*history)--;
- } else {
- i--;
- }
- return i;
-}
-
static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
{
const struct btf_type *func;
@@ -4183,7 +3591,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
return btf_name_by_offset(desc_btf, func->name_off);
}
-static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn)
+void bpf_verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
const struct bpf_insn_cbs cbs = {
.cb_call = disasm_kfunc_name,
@@ -4194,158 +3602,10 @@ static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn)
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}
-static inline void bt_init(struct backtrack_state *bt, u32 frame)
-{
- bt->frame = frame;
-}
-
-static inline void bt_reset(struct backtrack_state *bt)
-{
- struct bpf_verifier_env *env = bt->env;
-
- memset(bt, 0, sizeof(*bt));
- bt->env = env;
-}
-
-static inline u32 bt_empty(struct backtrack_state *bt)
-{
- u64 mask = 0;
- int i;
-
- for (i = 0; i <= bt->frame; i++)
- mask |= bt->reg_masks[i] | bt->stack_masks[i];
-
- return mask == 0;
-}
-
-static inline int bt_subprog_enter(struct backtrack_state *bt)
-{
- if (bt->frame == MAX_CALL_FRAMES - 1) {
- verifier_bug(bt->env, "subprog enter from frame %d", bt->frame);
- return -EFAULT;
- }
- bt->frame++;
- return 0;
-}
-
-static inline int bt_subprog_exit(struct backtrack_state *bt)
-{
- if (bt->frame == 0) {
- verifier_bug(bt->env, "subprog exit from frame 0");
- return -EFAULT;
- }
- bt->frame--;
- return 0;
-}
-
-static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
-{
- bt->reg_masks[frame] |= 1 << reg;
-}
-
-static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
-{
- bt->reg_masks[frame] &= ~(1 << reg);
-}
-
-static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
-{
- bt_set_frame_reg(bt, bt->frame, reg);
-}
-
-static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
-{
- bt_clear_frame_reg(bt, bt->frame, reg);
-}
-
-static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
-{
- bt->stack_masks[frame] |= 1ull << slot;
-}
-
-static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
-{
- bt->stack_masks[frame] &= ~(1ull << slot);
-}
-
-static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
-{
- return bt->reg_masks[frame];
-}
-
-static inline u32 bt_reg_mask(struct backtrack_state *bt)
-{
- return bt->reg_masks[bt->frame];
-}
-
-static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
-{
- return bt->stack_masks[frame];
-}
-
-static inline u64 bt_stack_mask(struct backtrack_state *bt)
-{
- return bt->stack_masks[bt->frame];
-}
-
-static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
-{
- return bt->reg_masks[bt->frame] & (1 << reg);
-}
-
-static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg)
-{
- return bt->reg_masks[frame] & (1 << reg);
-}
-
-static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
-{
- return bt->stack_masks[frame] & (1ull << slot);
-}
-
-/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
-static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
-{
- DECLARE_BITMAP(mask, 64);
- bool first = true;
- int i, n;
-
- buf[0] = '\0';
-
- bitmap_from_u64(mask, reg_mask);
- for_each_set_bit(i, mask, 32) {
- n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i);
- first = false;
- buf += n;
- buf_sz -= n;
- if (buf_sz < 0)
- break;
- }
-}
-/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
-void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
-{
- DECLARE_BITMAP(mask, 64);
- bool first = true;
- int i, n;
-
- buf[0] = '\0';
-
- bitmap_from_u64(mask, stack_mask);
- for_each_set_bit(i, mask, 64) {
- n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8);
- first = false;
- buf += n;
- buf_sz -= n;
- if (buf_sz < 0)
- break;
- }
-}
-
/* If any register R in hist->linked_regs is marked as precise in bt,
* do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
*/
-static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
+void bpf_bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
{
struct linked_regs linked_regs;
bool some_precise = false;
@@ -4372,727 +3632,15 @@ static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_histo
struct linked_reg *e = &linked_regs.entries[i];
if (e->is_reg)
- bt_set_frame_reg(bt, e->frameno, e->regno);
+ bpf_bt_set_frame_reg(bt, e->frameno, e->regno);
else
- bt_set_frame_slot(bt, e->frameno, e->spi);
- }
-}
-
-/* For given verifier state backtrack_insn() is called from the last insn to
- * the first insn. Its purpose is to compute a bitmask of registers and
- * stack slots that needs precision in the parent verifier state.
- *
- * @idx is an index of the instruction we are currently processing;
- * @subseq_idx is an index of the subsequent instruction that:
- * - *would be* executed next, if jump history is viewed in forward order;
- * - *was* processed previously during backtracking.
- */
-static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
- struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
-{
- struct bpf_insn *insn = env->prog->insnsi + idx;
- u8 class = BPF_CLASS(insn->code);
- u8 opcode = BPF_OP(insn->code);
- u8 mode = BPF_MODE(insn->code);
- u32 dreg = insn->dst_reg;
- u32 sreg = insn->src_reg;
- u32 spi, i, fr;
-
- if (insn->code == 0)
- return 0;
- if (env->log.level & BPF_LOG_LEVEL2) {
- fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
- verbose(env, "mark_precise: frame%d: regs=%s ",
- bt->frame, env->tmp_str_buf);
- bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
- verbose(env, "stack=%s before ", env->tmp_str_buf);
- verbose(env, "%d: ", idx);
- verbose_insn(env, insn);
- }
-
- /* If there is a history record that some registers gained range at this insn,
- * propagate precision marks to those registers, so that bt_is_reg_set()
- * accounts for these registers.
- */
- bt_sync_linked_regs(bt, hist);
-
- if (class == BPF_ALU || class == BPF_ALU64) {
- if (!bt_is_reg_set(bt, dreg))
- return 0;
- if (opcode == BPF_END || opcode == BPF_NEG) {
- /* sreg is reserved and unused
- * dreg still need precision before this insn
- */
- return 0;
- } else if (opcode == BPF_MOV) {
- if (BPF_SRC(insn->code) == BPF_X) {
- /* dreg = sreg or dreg = (s8, s16, s32)sreg
- * dreg needs precision after this insn
- * sreg needs precision before this insn
- */
- bt_clear_reg(bt, dreg);
- if (sreg != BPF_REG_FP)
- bt_set_reg(bt, sreg);
- } else {
- /* dreg = K
- * dreg needs precision after this insn.
- * Corresponding register is already marked
- * as precise=true in this verifier state.
- * No further markings in parent are necessary
- */
- bt_clear_reg(bt, dreg);
- }
- } else {
- if (BPF_SRC(insn->code) == BPF_X) {
- /* dreg += sreg
- * both dreg and sreg need precision
- * before this insn
- */
- if (sreg != BPF_REG_FP)
- bt_set_reg(bt, sreg);
- } /* else dreg += K
- * dreg still needs precision before this insn
- */
- }
- } else if (class == BPF_LDX ||
- is_atomic_load_insn(insn) ||
- is_atomic_fetch_insn(insn)) {
- u32 load_reg = dreg;
-
- /*
- * Atomic fetch operation writes the old value into
- * a register (sreg or r0) and if it was tracked for
- * precision, propagate to the stack slot like we do
- * in regular ldx.
- */
- if (is_atomic_fetch_insn(insn))
- load_reg = insn->imm == BPF_CMPXCHG ?
- BPF_REG_0 : sreg;
-
- if (!bt_is_reg_set(bt, load_reg))
- return 0;
- bt_clear_reg(bt, load_reg);
-
- /* scalars can only be spilled into stack w/o losing precision.
- * Load from any other memory can be zero extended.
- * The desire to keep that precision is already indicated
- * by 'precise' mark in corresponding register of this state.
- * No further tracking necessary.
- */
- if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
- return 0;
- /* dreg = *(u64 *)[fp - off] was a fill from the stack.
- * that [fp - off] slot contains scalar that needs to be
- * tracked with precision
- */
- spi = insn_stack_access_spi(hist->flags);
- fr = insn_stack_access_frameno(hist->flags);
- bt_set_frame_slot(bt, fr, spi);
- } else if (class == BPF_STX || class == BPF_ST) {
- if (bt_is_reg_set(bt, dreg))
- /* stx & st shouldn't be using _scalar_ dst_reg
- * to access memory. It means backtracking
- * encountered a case of pointer subtraction.
- */
- return -ENOTSUPP;
- /* scalars can only be spilled into stack */
- if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
- return 0;
- spi = insn_stack_access_spi(hist->flags);
- fr = insn_stack_access_frameno(hist->flags);
- if (!bt_is_frame_slot_set(bt, fr, spi))
- return 0;
- bt_clear_frame_slot(bt, fr, spi);
- if (class == BPF_STX)
- bt_set_reg(bt, sreg);
- } else if (class == BPF_JMP || class == BPF_JMP32) {
- if (bpf_pseudo_call(insn)) {
- int subprog_insn_idx, subprog;
-
- subprog_insn_idx = idx + insn->imm + 1;
- subprog = find_subprog(env, subprog_insn_idx);
- if (subprog < 0)
- return -EFAULT;
-
- if (subprog_is_global(env, subprog)) {
- /* check that jump history doesn't have any
- * extra instructions from subprog; the next
- * instruction after call to global subprog
- * should be literally next instruction in
- * caller program
- */
- verifier_bug_if(idx + 1 != subseq_idx, env,
- "extra insn from subprog");
- /* r1-r5 are invalidated after subprog call,
- * so for global func call it shouldn't be set
- * anymore
- */
- if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- verifier_bug(env, "global subprog unexpected regs %x",
- bt_reg_mask(bt));
- return -EFAULT;
- }
- /* global subprog always sets R0 */
- bt_clear_reg(bt, BPF_REG_0);
- return 0;
- } else {
- /* static subprog call instruction, which
- * means that we are exiting current subprog,
- * so only r1-r5 could be still requested as
- * precise, r0 and r6-r10 or any stack slot in
- * the current frame should be zero by now
- */
- if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
- verifier_bug(env, "static subprog unexpected regs %x",
- bt_reg_mask(bt));
- return -EFAULT;
- }
- /* we are now tracking register spills correctly,
- * so any instance of leftover slots is a bug
- */
- if (bt_stack_mask(bt) != 0) {
- verifier_bug(env,
- "static subprog leftover stack slots %llx",
- bt_stack_mask(bt));
- return -EFAULT;
- }
- /* propagate r1-r5 to the caller */
- for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
- if (bt_is_reg_set(bt, i)) {
- bt_clear_reg(bt, i);
- bt_set_frame_reg(bt, bt->frame - 1, i);
- }
- }
- if (bt_subprog_exit(bt))
- return -EFAULT;
- return 0;
- }
- } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
- /* exit from callback subprog to callback-calling helper or
- * kfunc call. Use idx/subseq_idx check to discern it from
- * straight line code backtracking.
- * Unlike the subprog call handling above, we shouldn't
- * propagate precision of r1-r5 (if any requested), as they are
- * not actually arguments passed directly to callback subprogs
- */
- if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
- verifier_bug(env, "callback unexpected regs %x",
- bt_reg_mask(bt));
- return -EFAULT;
- }
- if (bt_stack_mask(bt) != 0) {
- verifier_bug(env, "callback leftover stack slots %llx",
- bt_stack_mask(bt));
- return -EFAULT;
- }
- /* clear r1-r5 in callback subprog's mask */
- for (i = BPF_REG_1; i <= BPF_REG_5; i++)
- bt_clear_reg(bt, i);
- if (bt_subprog_exit(bt))
- return -EFAULT;
- return 0;
- } else if (opcode == BPF_CALL) {
- /* kfunc with imm==0 is invalid and fixup_kfunc_call will
- * catch this error later. Make backtracking conservative
- * with ENOTSUPP.
- */
- if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
- return -ENOTSUPP;
- /* regular helper call sets R0 */
- bt_clear_reg(bt, BPF_REG_0);
- if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- /* if backtracking was looking for registers R1-R5
- * they should have been found already.
- */
- verifier_bug(env, "backtracking call unexpected regs %x",
- bt_reg_mask(bt));
- return -EFAULT;
- }
- if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call
- && subseq_idx - idx != 1) {
- if (bt_subprog_enter(bt))
- return -EFAULT;
- }
- } else if (opcode == BPF_EXIT) {
- bool r0_precise;
-
- /* Backtracking to a nested function call, 'idx' is a part of
- * the inner frame 'subseq_idx' is a part of the outer frame.
- * In case of a regular function call, instructions giving
- * precision to registers R1-R5 should have been found already.
- * In case of a callback, it is ok to have R1-R5 marked for
- * backtracking, as these registers are set by the function
- * invoking callback.
- */
- if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx))
- for (i = BPF_REG_1; i <= BPF_REG_5; i++)
- bt_clear_reg(bt, i);
- if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- verifier_bug(env, "backtracking exit unexpected regs %x",
- bt_reg_mask(bt));
- return -EFAULT;
- }
-
- /* BPF_EXIT in subprog or callback always returns
- * right after the call instruction, so by checking
- * whether the instruction at subseq_idx-1 is subprog
- * call or not we can distinguish actual exit from
- * *subprog* from exit from *callback*. In the former
- * case, we need to propagate r0 precision, if
- * necessary. In the former we never do that.
- */
- r0_precise = subseq_idx - 1 >= 0 &&
- bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
- bt_is_reg_set(bt, BPF_REG_0);
-
- bt_clear_reg(bt, BPF_REG_0);
- if (bt_subprog_enter(bt))
- return -EFAULT;
-
- if (r0_precise)
- bt_set_reg(bt, BPF_REG_0);
- /* r6-r9 and stack slots will stay set in caller frame
- * bitmasks until we return back from callee(s)
- */
- return 0;
- } else if (BPF_SRC(insn->code) == BPF_X) {
- if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
- return 0;
- /* dreg <cond> sreg
- * Both dreg and sreg need precision before
- * this insn. If only sreg was marked precise
- * before it would be equally necessary to
- * propagate it to dreg.
- */
- if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK))
- bt_set_reg(bt, sreg);
- if (!hist || !(hist->flags & INSN_F_DST_REG_STACK))
- bt_set_reg(bt, dreg);
- } else if (BPF_SRC(insn->code) == BPF_K) {
- /* dreg <cond> K
- * Only dreg still needs precision before
- * this insn, so for the K-based conditional
- * there is nothing new to be marked.
- */
- }
- } else if (class == BPF_LD) {
- if (!bt_is_reg_set(bt, dreg))
- return 0;
- bt_clear_reg(bt, dreg);
- /* It's ld_imm64 or ld_abs or ld_ind.
- * For ld_imm64 no further tracking of precision
- * into parent is necessary
- */
- if (mode == BPF_IND || mode == BPF_ABS)
- /* to be analyzed */
- return -ENOTSUPP;
- }
- /* Propagate precision marks to linked registers, to account for
- * registers marked as precise in this function.
- */
- bt_sync_linked_regs(bt, hist);
- return 0;
-}
-
-/* the scalar precision tracking algorithm:
- * . at the start all registers have precise=false.
- * . scalar ranges are tracked as normal through alu and jmp insns.
- * . once precise value of the scalar register is used in:
- * . ptr + scalar alu
- * . if (scalar cond K|scalar)
- * . helper_call(.., scalar, ...) where ARG_CONST is expected
- * backtrack through the verifier states and mark all registers and
- * stack slots with spilled constants that these scalar registers
- * should be precise.
- * . during state pruning two registers (or spilled stack slots)
- * are equivalent if both are not precise.
- *
- * Note the verifier cannot simply walk register parentage chain,
- * since many different registers and stack slots could have been
- * used to compute single precise scalar.
- *
- * The approach of starting with precise=true for all registers and then
- * backtrack to mark a register as not precise when the verifier detects
- * that program doesn't care about specific value (e.g., when helper
- * takes register as ARG_ANYTHING parameter) is not safe.
- *
- * It's ok to walk single parentage chain of the verifier states.
- * It's possible that this backtracking will go all the way till 1st insn.
- * All other branches will be explored for needing precision later.
- *
- * The backtracking needs to deal with cases like:
- * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
- * r9 -= r8
- * r5 = r9
- * if r5 > 0x79f goto pc+7
- * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
- * r5 += 1
- * ...
- * call bpf_perf_event_output#25
- * where .arg5_type = ARG_CONST_SIZE_OR_ZERO
- *
- * and this case:
- * r6 = 1
- * call foo // uses callee's r6 inside to compute r0
- * r0 += r6
- * if r0 == 0 goto
- *
- * to track above reg_mask/stack_mask needs to be independent for each frame.
- *
- * Also if parent's curframe > frame where backtracking started,
- * the verifier need to mark registers in both frames, otherwise callees
- * may incorrectly prune callers. This is similar to
- * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
- *
- * For now backtracking falls back into conservative marking.
- */
-static void mark_all_scalars_precise(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st)
-{
- struct bpf_func_state *func;
- struct bpf_reg_state *reg;
- int i, j;
-
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
- st->curframe);
- }
-
- /* big hammer: mark all scalars precise in this path.
- * pop_stack may still get !precise scalars.
- * We also skip current state and go straight to first parent state,
- * because precision markings in current non-checkpointed state are
- * not needed. See why in the comment in __mark_chain_precision below.
- */
- for (st = st->parent; st; st = st->parent) {
- for (i = 0; i <= st->curframe; i++) {
- func = st->frame[i];
- for (j = 0; j < BPF_REG_FP; j++) {
- reg = &func->regs[j];
- if (reg->type != SCALAR_VALUE || reg->precise)
- continue;
- reg->precise = true;
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
- i, j);
- }
- }
- for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
- if (!is_spilled_reg(&func->stack[j]))
- continue;
- reg = &func->stack[j].spilled_ptr;
- if (reg->type != SCALAR_VALUE || reg->precise)
- continue;
- reg->precise = true;
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
- i, -(j + 1) * 8);
- }
- }
- }
- }
-}
-
-static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
-{
- struct bpf_func_state *func;
- struct bpf_reg_state *reg;
- int i, j;
-
- for (i = 0; i <= st->curframe; i++) {
- func = st->frame[i];
- for (j = 0; j < BPF_REG_FP; j++) {
- reg = &func->regs[j];
- if (reg->type != SCALAR_VALUE)
- continue;
- reg->precise = false;
- }
- for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
- if (!is_spilled_reg(&func->stack[j]))
- continue;
- reg = &func->stack[j].spilled_ptr;
- if (reg->type != SCALAR_VALUE)
- continue;
- reg->precise = false;
- }
- }
-}
-
-/*
- * __mark_chain_precision() backtracks BPF program instruction sequence and
- * chain of verifier states making sure that register *regno* (if regno >= 0)
- * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
- * SCALARS, as well as any other registers and slots that contribute to
- * a tracked state of given registers/stack slots, depending on specific BPF
- * assembly instructions (see backtrack_insns() for exact instruction handling
- * logic). This backtracking relies on recorded jmp_history and is able to
- * traverse entire chain of parent states. This process ends only when all the
- * necessary registers/slots and their transitive dependencies are marked as
- * precise.
- *
- * One important and subtle aspect is that precise marks *do not matter* in
- * the currently verified state (current state). It is important to understand
- * why this is the case.
- *
- * First, note that current state is the state that is not yet "checkpointed",
- * i.e., it is not yet put into env->explored_states, and it has no children
- * states as well. It's ephemeral, and can end up either a) being discarded if
- * compatible explored state is found at some point or BPF_EXIT instruction is
- * reached or b) checkpointed and put into env->explored_states, branching out
- * into one or more children states.
- *
- * In the former case, precise markings in current state are completely
- * ignored by state comparison code (see regsafe() for details). Only
- * checkpointed ("old") state precise markings are important, and if old
- * state's register/slot is precise, regsafe() assumes current state's
- * register/slot as precise and checks value ranges exactly and precisely. If
- * states turn out to be compatible, current state's necessary precise
- * markings and any required parent states' precise markings are enforced
- * after the fact with propagate_precision() logic, after the fact. But it's
- * important to realize that in this case, even after marking current state
- * registers/slots as precise, we immediately discard current state. So what
- * actually matters is any of the precise markings propagated into current
- * state's parent states, which are always checkpointed (due to b) case above).
- * As such, for scenario a) it doesn't matter if current state has precise
- * markings set or not.
- *
- * Now, for the scenario b), checkpointing and forking into child(ren)
- * state(s). Note that before current state gets to checkpointing step, any
- * processed instruction always assumes precise SCALAR register/slot
- * knowledge: if precise value or range is useful to prune jump branch, BPF
- * verifier takes this opportunity enthusiastically. Similarly, when
- * register's value is used to calculate offset or memory address, exact
- * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
- * what we mentioned above about state comparison ignoring precise markings
- * during state comparison, BPF verifier ignores and also assumes precise
- * markings *at will* during instruction verification process. But as verifier
- * assumes precision, it also propagates any precision dependencies across
- * parent states, which are not yet finalized, so can be further restricted
- * based on new knowledge gained from restrictions enforced by their children
- * states. This is so that once those parent states are finalized, i.e., when
- * they have no more active children state, state comparison logic in
- * is_state_visited() would enforce strict and precise SCALAR ranges, if
- * required for correctness.
- *
- * To build a bit more intuition, note also that once a state is checkpointed,
- * the path we took to get to that state is not important. This is crucial
- * property for state pruning. When state is checkpointed and finalized at
- * some instruction index, it can be correctly and safely used to "short
- * circuit" any *compatible* state that reaches exactly the same instruction
- * index. I.e., if we jumped to that instruction from a completely different
- * code path than original finalized state was derived from, it doesn't
- * matter, current state can be discarded because from that instruction
- * forward having a compatible state will ensure we will safely reach the
- * exit. States describe preconditions for further exploration, but completely
- * forget the history of how we got here.
- *
- * This also means that even if we needed precise SCALAR range to get to
- * finalized state, but from that point forward *that same* SCALAR register is
- * never used in a precise context (i.e., it's precise value is not needed for
- * correctness), it's correct and safe to mark such register as "imprecise"
- * (i.e., precise marking set to false). This is what we rely on when we do
- * not set precise marking in current state. If no child state requires
- * precision for any given SCALAR register, it's safe to dictate that it can
- * be imprecise. If any child state does require this register to be precise,
- * we'll mark it precise later retroactively during precise markings
- * propagation from child state to parent states.
- *
- * Skipping precise marking setting in current state is a mild version of
- * relying on the above observation. But we can utilize this property even
- * more aggressively by proactively forgetting any precise marking in the
- * current state (which we inherited from the parent state), right before we
- * checkpoint it and branch off into new child state. This is done by
- * mark_all_scalars_imprecise() to hopefully get more permissive and generic
- * finalized states which help in short circuiting more future states.
- */
-static int __mark_chain_precision(struct bpf_verifier_env *env,
- struct bpf_verifier_state *starting_state,
- int regno,
- bool *changed)
-{
- struct bpf_verifier_state *st = starting_state;
- struct backtrack_state *bt = &env->bt;
- int first_idx = st->first_insn_idx;
- int last_idx = starting_state->insn_idx;
- int subseq_idx = -1;
- struct bpf_func_state *func;
- bool tmp, skip_first = true;
- struct bpf_reg_state *reg;
- int i, fr, err;
-
- if (!env->bpf_capable)
- return 0;
-
- changed = changed ?: &tmp;
- /* set frame number from which we are starting to backtrack */
- bt_init(bt, starting_state->curframe);
-
- /* Do sanity checks against current state of register and/or stack
- * slot, but don't set precise flag in current state, as precision
- * tracking in the current state is unnecessary.
- */
- func = st->frame[bt->frame];
- if (regno >= 0) {
- reg = &func->regs[regno];
- if (reg->type != SCALAR_VALUE) {
- verifier_bug(env, "backtracking misuse");
- return -EFAULT;
- }
- bt_set_reg(bt, regno);
- }
-
- if (bt_empty(bt))
- return 0;
-
- for (;;) {
- DECLARE_BITMAP(mask, 64);
- u32 history = st->jmp_history_cnt;
- struct bpf_jmp_history_entry *hist;
-
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
- bt->frame, last_idx, first_idx, subseq_idx);
- }
-
- if (last_idx < 0) {
- /* we are at the entry into subprog, which
- * is expected for global funcs, but only if
- * requested precise registers are R1-R5
- * (which are global func's input arguments)
- */
- if (st->curframe == 0 &&
- st->frame[0]->subprogno > 0 &&
- st->frame[0]->callsite == BPF_MAIN_FUNC &&
- bt_stack_mask(bt) == 0 &&
- (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
- bitmap_from_u64(mask, bt_reg_mask(bt));
- for_each_set_bit(i, mask, 32) {
- reg = &st->frame[0]->regs[i];
- bt_clear_reg(bt, i);
- if (reg->type == SCALAR_VALUE) {
- reg->precise = true;
- *changed = true;
- }
- }
- return 0;
- }
-
- verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx",
- st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
- return -EFAULT;
- }
-
- for (i = last_idx;;) {
- if (skip_first) {
- err = 0;
- skip_first = false;
- } else {
- hist = get_jmp_hist_entry(st, history, i);
- err = backtrack_insn(env, i, subseq_idx, hist, bt);
- }
- if (err == -ENOTSUPP) {
- mark_all_scalars_precise(env, starting_state);
- bt_reset(bt);
- return 0;
- } else if (err) {
- return err;
- }
- if (bt_empty(bt))
- /* Found assignment(s) into tracked register in this state.
- * Since this state is already marked, just return.
- * Nothing to be tracked further in the parent state.
- */
- return 0;
- subseq_idx = i;
- i = get_prev_insn_idx(st, i, &history);
- if (i == -ENOENT)
- break;
- if (i >= env->prog->len) {
- /* This can happen if backtracking reached insn 0
- * and there are still reg_mask or stack_mask
- * to backtrack.
- * It means the backtracking missed the spot where
- * particular register was initialized with a constant.
- */
- verifier_bug(env, "backtracking idx %d", i);
- return -EFAULT;
- }
- }
- st = st->parent;
- if (!st)
- break;
-
- for (fr = bt->frame; fr >= 0; fr--) {
- func = st->frame[fr];
- bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
- for_each_set_bit(i, mask, 32) {
- reg = &func->regs[i];
- if (reg->type != SCALAR_VALUE) {
- bt_clear_frame_reg(bt, fr, i);
- continue;
- }
- if (reg->precise) {
- bt_clear_frame_reg(bt, fr, i);
- } else {
- reg->precise = true;
- *changed = true;
- }
- }
-
- bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
- for_each_set_bit(i, mask, 64) {
- if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE,
- env, "stack slot %d, total slots %d",
- i, func->allocated_stack / BPF_REG_SIZE))
- return -EFAULT;
-
- if (!is_spilled_scalar_reg(&func->stack[i])) {
- bt_clear_frame_slot(bt, fr, i);
- continue;
- }
- reg = &func->stack[i].spilled_ptr;
- if (reg->precise) {
- bt_clear_frame_slot(bt, fr, i);
- } else {
- reg->precise = true;
- *changed = true;
- }
- }
- if (env->log.level & BPF_LOG_LEVEL2) {
- fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
- bt_frame_reg_mask(bt, fr));
- verbose(env, "mark_precise: frame%d: parent state regs=%s ",
- fr, env->tmp_str_buf);
- bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
- bt_frame_stack_mask(bt, fr));
- verbose(env, "stack=%s: ", env->tmp_str_buf);
- print_verifier_state(env, st, fr, true);
- }
- }
-
- if (bt_empty(bt))
- return 0;
-
- subseq_idx = first_idx;
- last_idx = st->last_insn_idx;
- first_idx = st->first_insn_idx;
+ bpf_bt_set_frame_slot(bt, e->frameno, e->spi);
}
-
- /* if we still have requested precise regs or slots, we missed
- * something (e.g., stack access through non-r10 register), so
- * fallback to marking all precise
- */
- if (!bt_empty(bt)) {
- mark_all_scalars_precise(env, starting_state);
- bt_reset(bt);
- }
-
- return 0;
}
int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
- return __mark_chain_precision(env, env->cur_state, regno, NULL);
+ return bpf_mark_chain_precision(env, env->cur_state, regno, NULL);
}
/* mark_chain_precision_batch() assumes that env->bt is set in the caller to
@@ -5101,7 +3649,7 @@ int mark_chain_precision(struct bpf_verifier_env *env, int regno)
static int mark_chain_precision_batch(struct bpf_verifier_env *env,
struct bpf_verifier_state *starting_state)
{
- return __mark_chain_precision(env, starting_state, -1, NULL);
+ return bpf_mark_chain_precision(env, starting_state, -1, NULL);
}
static bool is_spillable_regtype(enum bpf_reg_type type)
@@ -5131,11 +3679,6 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
}
}
-/* Does this register contain a constant zero? */
-static bool register_is_null(struct bpf_reg_state *reg)
-{
- return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
-}
/* check if register is a constant scalar value */
static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
@@ -5159,27 +3702,30 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
+static void clear_scalar_id(struct bpf_reg_state *reg)
+{
+ reg->id = 0;
+ reg->delta = 0;
+}
+
static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
struct bpf_reg_state *src_reg)
{
if (src_reg->type != SCALAR_VALUE)
return;
-
- if (src_reg->id & BPF_ADD_CONST) {
- /*
- * The verifier is processing rX = rY insn and
- * rY->id has special linked register already.
- * Cleared it, since multiple rX += const are not supported.
- */
- src_reg->id = 0;
- src_reg->off = 0;
- }
-
+ /*
+ * The verifier is processing rX = rY insn and
+ * rY->id has special linked register already.
+ * Cleared it, since multiple rX += const are not supported.
+ */
+ if (src_reg->id & BPF_ADD_CONST)
+ clear_scalar_id(src_reg);
+ /*
+ * Ensure that src_reg has a valid ID that will be copied to
+ * dst_reg and then will be used by sync_linked_regs() to
+ * propagate min/max range.
+ */
if (!src_reg->id && !tnum_is_const(src_reg->var_off))
- /* Ensure that src_reg has a valid ID that will be copied to
- * dst_reg and then will be used by sync_linked_regs() to
- * propagate min/max range.
- */
src_reg->id = ++env->id_gen;
}
@@ -5241,6 +3787,18 @@ static void check_fastcall_stack_contract(struct bpf_verifier_env *env,
}
}
+static void scrub_special_slot(struct bpf_func_state *state, int spi)
+{
+ int i;
+
+ /* regular write of data into stack destroys any spilled ptr */
+ state->stack[spi].spilled_ptr.type = NOT_INIT;
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+ if (is_stack_slot_special(&state->stack[spi]))
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ scrub_spilled_slot(&state->stack[spi].slot_type[i]);
+}
+
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -5260,8 +3818,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
* so it's aligned access and [off, off + size) are within stack limits
*/
if (!env->allow_ptr_leaks &&
- is_spilled_reg(&state->stack[spi]) &&
- !is_spilled_scalar_reg(&state->stack[spi]) &&
+ bpf_is_spilled_reg(&state->stack[spi]) &&
+ !bpf_is_spilled_scalar_reg(&state->stack[spi]) &&
size != BPF_REG_SIZE) {
verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
@@ -5290,18 +3848,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
if (err)
return err;
- if (!(off % BPF_REG_SIZE) && size == BPF_REG_SIZE) {
- /* only mark the slot as written if all 8 bytes were written
- * otherwise read propagation may incorrectly stop too soon
- * when stack slots are partially written.
- * This heuristic means that read propagation will be
- * conservative, since it will add reg_live_read marks
- * to stack slots all the way to first state when programs
- * writes+reads less than 8 bytes
- */
- bpf_mark_stack_write(env, state->frameno, BIT(spi));
- }
-
check_fastcall_stack_contract(env, state, insn_idx, off);
mark_stack_slot_scratched(env, spi);
if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
@@ -5338,15 +3884,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
} else {
u8 type = STACK_MISC;
- /* regular write of data into stack destroys any spilled ptr */
- state->stack[spi].spilled_ptr.type = NOT_INIT;
- /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
- if (is_stack_slot_special(&state->stack[spi]))
- for (i = 0; i < BPF_REG_SIZE; i++)
- scrub_spilled_slot(&state->stack[spi].slot_type[i]);
+ scrub_special_slot(state, spi);
/* when we zero initialize stack slots mark them as such */
- if ((reg && register_is_null(reg)) ||
+ if ((reg && bpf_register_is_null(reg)) ||
(!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
/* STACK_ZERO case happened because register spill
* wasn't properly aligned at the stack slot boundary,
@@ -5367,7 +3908,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags, 0);
+ return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -5377,7 +3918,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
* tracks the effects of the write, considering that each stack slot in the
* dynamic range is potentially written to.
*
- * 'off' includes 'regno->off'.
* 'value_regno' can be -1, meaning that an unknown value is being written to
* the stack.
*
@@ -5413,14 +3953,14 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
max_off = ptr_reg->smax_value + off + size;
if (value_regno >= 0)
value_reg = &cur->regs[value_regno];
- if ((value_reg && register_is_null(value_reg)) ||
+ if ((value_reg && bpf_register_is_null(value_reg)) ||
(!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
writing_zero = true;
for (i = min_off; i < max_off; i++) {
int spi;
- spi = __get_spi(i);
+ spi = bpf_get_spi(i);
err = destroy_if_dynptr_stack_slot(env, state, spi);
if (err)
return err;
@@ -5458,7 +3998,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
* maintain the spill type.
*/
if (writing_zero && *stype == STACK_SPILL &&
- is_spilled_scalar_reg(&state->stack[spi])) {
+ bpf_is_spilled_scalar_reg(&state->stack[spi])) {
struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr;
if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) {
@@ -5467,8 +4007,13 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
}
}
- /* Erase all other spilled pointers. */
- state->stack[spi].spilled_ptr.type = NOT_INIT;
+ /*
+ * Scrub slots if variable-offset stack write goes over spilled pointers.
+ * Otherwise bpf_is_spilled_reg() may == true && spilled_ptr.type == NOT_INIT
+ * and valid program is rejected by check_stack_read_fixed_off()
+ * with obscure "invalid size of register fill" message.
+ */
+ scrub_special_slot(state, spi);
/* Update the slot type. */
new_type = STACK_MISC;
@@ -5483,8 +4028,10 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
* For privileged programs, we will accept such reads to slots
* that may or may not be written because, if we're reject
* them, the error would be too confusing.
+ * Conservatively, treat STACK_POISON in a similar way.
*/
- if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
+ if ((*stype == STACK_INVALID || *stype == STACK_POISON) &&
+ !env->allow_uninit_stack) {
verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
insn_idx, i);
return -EINVAL;
@@ -5559,18 +4106,14 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
struct bpf_reg_state *reg;
u8 *stype, type;
int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
- int err;
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
mark_stack_slot_scratched(env, spi);
check_fastcall_stack_contract(env, state, env->insn_idx, off);
- err = bpf_mark_stack_read(env, reg_state->frameno, env->insn_idx, BIT(spi));
- if (err)
- return err;
- if (is_spilled_reg(&reg_state->stack[spi])) {
+ if (bpf_is_spilled_reg(&reg_state->stack[spi])) {
u8 spill_size = 1;
for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
@@ -5606,7 +4149,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
* coerce_reg_to_size will adjust the boundaries.
*/
if (get_reg_width(reg) > size * BITS_PER_BYTE)
- state->regs[dst_regno].id = 0;
+ clear_scalar_id(&state->regs[dst_regno]);
} else {
int spill_cnt = 0, zero_cnt = 0;
@@ -5624,8 +4167,13 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
}
if (type == STACK_INVALID && env->allow_uninit_stack)
continue;
- verbose(env, "invalid read from stack off %d+%d size %d\n",
- off, i, size);
+ if (type == STACK_POISON) {
+ verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n",
+ off, i, size);
+ } else {
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ }
return -EACCES;
}
@@ -5674,8 +4222,13 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
continue;
if (type == STACK_INVALID && env->allow_uninit_stack)
continue;
- verbose(env, "invalid read from stack off %d+%d size %d\n",
- off, i, size);
+ if (type == STACK_POISON) {
+ verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n",
+ off, i, size);
+ } else {
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ }
return -EACCES;
}
if (dst_regno >= 0)
@@ -5683,7 +4236,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
insn_flags = 0; /* we are not restoring spilled register */
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags, 0);
+ return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -5721,7 +4274,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env,
{
/* The state of the source register. */
struct bpf_reg_state *reg = reg_state(env, ptr_regno);
- struct bpf_func_state *ptr_state = func(env, reg);
+ struct bpf_func_state *ptr_state = bpf_func(env, reg);
int err;
int min_off, max_off;
@@ -5753,7 +4306,7 @@ static int check_stack_read(struct bpf_verifier_env *env,
int dst_regno)
{
struct bpf_reg_state *reg = reg_state(env, ptr_regno);
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int err;
/* Some accesses are only permitted with a static offset. */
bool var_off = !tnum_is_const(reg->var_off);
@@ -5799,7 +4352,6 @@ static int check_stack_read(struct bpf_verifier_env *env,
* check_stack_write_var_off.
*
* 'ptr_regno' is the register used as a pointer into the stack.
- * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
* 'value_regno' is the register whose value we're writing to the stack. It can
* be -1, meaning that we're not writing from a register.
*
@@ -5810,7 +4362,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
int value_regno, int insn_idx)
{
struct bpf_reg_state *reg = reg_state(env, ptr_regno);
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int err;
if (tnum_is_const(reg->var_off)) {
@@ -5836,14 +4388,14 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
u32 cap = bpf_map_flags_to_cap(map);
if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
- verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n",
- map->value_size, off, size);
+ verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n",
+ map->value_size, reg->smin_value + off, size);
return -EACCES;
}
if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) {
- verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n",
- map->value_size, off, size);
+ verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n",
+ map->value_size, reg->smin_value + off, size);
return -EACCES;
}
@@ -5877,6 +4429,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, off, mem_size);
break;
+ case PTR_TO_CTX:
+ verbose(env, "invalid access to context, ctx_size=%d off=%d size=%d\n",
+ mem_size, off, size);
+ break;
case PTR_TO_MEM:
default:
verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n",
@@ -5950,24 +4506,24 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
* is only allowed in its original, unmodified form.
*/
- if (reg->off < 0) {
- verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
- reg_type_str(env, reg->type), regno, reg->off);
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "variable %s access var_off=%s disallowed\n",
+ reg_type_str(env, reg->type), tn_buf);
return -EACCES;
}
- if (!fixed_off_ok && reg->off) {
- verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
- reg_type_str(env, reg->type), regno, reg->off);
+ if (reg->smin_value < 0) {
+ verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->var_off.value);
return -EACCES;
}
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable %s access var_off=%s disallowed\n",
- reg_type_str(env, reg->type), tn_buf);
+ if (!fixed_off_ok && reg->var_off.value != 0) {
+ verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->var_off.value);
return -EACCES;
}
@@ -6009,14 +4565,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
/* For ref_ptr case, release function check should ensure we get one
* referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
* normal store of unreferenced kptr, we must ensure var_off is zero.
- * Since ref_ptr cannot be accessed directly by BPF insns, checks for
- * reg->off and reg->ref_obj_id are not needed here.
+ * Since ref_ptr cannot be accessed directly by BPF insns, check for
+ * reg->ref_obj_id is not needed here.
*/
if (__check_ptr_off_reg(env, reg, regno, true))
return -EACCES;
/* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
- * we also need to take into account the reg->off.
+ * we also need to take into account the reg->var_off.
*
* We want to support cases like:
*
@@ -6027,19 +4583,19 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
*
* struct foo *v;
* v = func(); // PTR_TO_BTF_ID
- * val->foo = v; // reg->off is zero, btf and btf_id match type
- * val->bar = &v->br; // reg->off is still zero, but we need to retry with
+ * val->foo = v; // reg->var_off is zero, btf and btf_id match type
+ * val->bar = &v->br; // reg->var_off is still zero, but we need to retry with
* // first member type of struct after comparison fails
- * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked
+ * val->baz = &v->bz; // reg->var_off is non-zero, so struct needs to be walked
* // to match type
*
- * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
+ * In the kptr_ref case, check_func_arg_reg_off already ensures reg->var_off
* is zero. We must also ensure that btf_struct_ids_match does not walk
* the struct to match type against first member of struct, i.e. reject
* second case from above. Hence, when type is BPF_KPTR_REF, we set
* strict mode to true for type match.
*/
- if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value,
kptr_field->kptr.btf, kptr_field->kptr.btf_id,
kptr_field->type != BPF_KPTR_UNREF))
goto bad_type;
@@ -6203,7 +4759,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
return ret;
} else if (class == BPF_STX) {
val_reg = reg_state(env, value_regno);
- if (!register_is_null(val_reg) &&
+ if (!bpf_register_is_null(val_reg) &&
map_kptr_match_type(env, kptr_field, val_reg, value_regno))
return -EACCES;
} else if (class == BPF_ST) {
@@ -6298,11 +4854,9 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
return 0;
}
-#define MAX_PACKET_OFF 0xffff
-
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
- const struct bpf_call_arg_meta *meta,
- enum bpf_access_type t)
+ const struct bpf_call_arg_meta *meta,
+ enum bpf_access_type t)
{
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
@@ -6348,27 +4902,14 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
struct bpf_reg_state *reg = reg_state(env, regno);
int err;
- /* We may have added a variable offset to the packet pointer; but any
- * reg->range we have comes after that. We are only checking the fixed
- * offset.
- */
-
- /* We don't allow negative numbers, because we aren't tracking enough
- * detail to prove they're safe.
- */
- if (reg->smin_value < 0) {
- verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
- regno);
- return -EACCES;
+ if (reg->range < 0) {
+ verbose(env, "R%d offset is outside of the packet\n", regno);
+ return -EINVAL;
}
- err = reg->range < 0 ? -EINVAL :
- __check_mem_access(env, regno, off, size, reg->range,
- zero_size_allowed);
- if (err) {
- verbose(env, "R%d offset is outside of the packet\n", regno);
+ err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed);
+ if (err)
return err;
- }
/* __check_mem_access has made sure "off + size - 1" is within u16.
* reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
@@ -6380,12 +4921,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
max_t(u32, env->prog->aux->max_pkt_offset,
off + reg->umax_value + size - 1);
- return err;
+ return 0;
+}
+
+static bool is_var_ctx_off_allowed(struct bpf_prog *prog)
+{
+ return resolve_prog_type(prog) == BPF_PROG_TYPE_SYSCALL;
}
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
-static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
- enum bpf_access_type t, struct bpf_insn_access_aux *info)
+static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
+ enum bpf_access_type t, struct bpf_insn_access_aux *info)
{
if (env->ops->is_valid_access &&
env->ops->is_valid_access(off, size, t, env->prog, info)) {
@@ -6416,6 +4962,34 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
return -EACCES;
}
+static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
+ int off, int access_size, enum bpf_access_type t,
+ struct bpf_insn_access_aux *info)
+{
+ /*
+ * Program types that don't rewrite ctx accesses can safely
+ * dereference ctx pointers with fixed offsets.
+ */
+ bool var_off_ok = is_var_ctx_off_allowed(env->prog);
+ bool fixed_off_ok = !env->ops->convert_ctx_access;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = regs + regno;
+ int err;
+
+ if (var_off_ok)
+ err = check_mem_region_access(env, regno, off, access_size, U16_MAX, false);
+ else
+ err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
+ if (err)
+ return err;
+ off += reg->umax_value;
+
+ err = __check_ctx_access(env, insn_idx, off, access_size, t, info);
+ if (err)
+ verbose_linfo(env, insn_idx, "; ");
+ return err;
+}
+
static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
int size)
{
@@ -6597,14 +5171,14 @@ static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
*/
ip_align = 2;
- reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off));
+ reg_off = tnum_add(reg->var_off, tnum_const(ip_align + off));
if (!tnum_is_aligned(reg_off, size)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env,
- "misaligned packet access off %d+%s+%d+%d size %d\n",
- ip_align, tn_buf, reg->off, off, size);
+ "misaligned packet access off %d+%s+%d size %d\n",
+ ip_align, tn_buf, off, size);
return -EACCES;
}
@@ -6622,13 +5196,13 @@ static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
if (!strict || size == 1)
return 0;
- reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
+ reg_off = tnum_add(reg->var_off, tnum_const(off));
if (!tnum_is_aligned(reg_off, size)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
- pointer_desc, tn_buf, reg->off, off, size);
+ verbose(env, "misaligned %saccess off %s+%d size %d\n",
+ pointer_desc, tn_buf, off, size);
return -EACCES;
}
@@ -6731,22 +5305,30 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
return round_up(max_t(u32, stack_depth, 1), 32);
}
+/* temporary state used for call frame depth calculation */
+struct bpf_subprog_call_depth_info {
+ int ret_insn; /* caller instruction where we return to. */
+ int caller; /* caller subprogram idx */
+ int frame; /* # of consecutive static call stack frames on top of stack */
+};
+
/* starting from main bpf function walk all instructions of the function
* and recursively walk all callees that given function can call.
* Ignore jump and exit insns.
- * Since recursion is prevented by check_cfg() this algorithm
- * only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+ struct bpf_subprog_call_depth_info *dinfo,
bool priv_stack_supported)
{
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
int depth = 0, frame = 0, i, subprog_end, subprog_depth;
bool tail_call_reachable = false;
- int ret_insn[MAX_CALL_FRAMES];
- int ret_prog[MAX_CALL_FRAMES];
- int j;
+ int total;
+ int tmp;
+
+ /* no caller idx */
+ dinfo[idx].caller = -1;
i = subprog[idx].start;
if (!priv_stack_supported)
@@ -6798,8 +5380,12 @@ process_func:
} else {
depth += subprog_depth;
if (depth > MAX_BPF_STACK) {
+ total = 0;
+ for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller)
+ total++;
+
verbose(env, "combined stack size of %d calls is %d. Too large\n",
- frame + 1, depth);
+ total, depth);
return -EACCES;
}
}
@@ -6813,10 +5399,8 @@ continue_func:
if (!is_bpf_throw_kfunc(insn + i))
continue;
- if (subprog[idx].is_cb)
- err = true;
- for (int c = 0; c < frame && !err; c++) {
- if (subprog[ret_prog[c]].is_cb) {
+ for (tmp = idx; tmp >= 0 && !err; tmp = dinfo[tmp].caller) {
+ if (subprog[tmp].is_cb) {
err = true;
break;
}
@@ -6832,12 +5416,10 @@ continue_func:
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
- ret_insn[frame] = i + 1;
- ret_prog[frame] = idx;
/* find the callee */
next_insn = i + insn[i].imm + 1;
- sidx = find_subprog(env, next_insn);
+ sidx = bpf_find_subprog(env, next_insn);
if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn))
return -EFAULT;
if (subprog[sidx].is_async_cb) {
@@ -6853,7 +5435,16 @@ continue_func:
return -EINVAL;
}
}
+
+ /* store caller info for after we return from callee */
+ dinfo[idx].frame = frame;
+ dinfo[idx].ret_insn = i + 1;
+
+ /* push caller idx into callee's dinfo */
+ dinfo[sidx].caller = idx;
+
i = next_insn;
+
idx = sidx;
if (!priv_stack_supported)
subprog[idx].priv_stack_mode = NO_PRIV_STACK;
@@ -6861,7 +5452,7 @@ continue_func:
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
- frame++;
+ frame = bpf_subprog_is_global(env, idx) ? 0 : frame + 1;
if (frame >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep !\n",
frame);
@@ -6875,12 +5466,12 @@ continue_func:
* tail call counter throughout bpf2bpf calls combined with tailcalls
*/
if (tail_call_reachable)
- for (j = 0; j < frame; j++) {
- if (subprog[ret_prog[j]].is_exception_cb) {
+ for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) {
+ if (subprog[tmp].is_exception_cb) {
verbose(env, "cannot tail call within exception cb\n");
return -EINVAL;
}
- subprog[ret_prog[j]].tail_call_reachable = true;
+ subprog[tmp].tail_call_reachable = true;
}
if (subprog[0].tail_call_reachable)
env->prog->aux->tail_call_reachable = true;
@@ -6888,23 +5479,33 @@ continue_func:
/* end of for() loop means the last insn of the 'subprog'
* was reached. Doesn't matter whether it was JA or EXIT
*/
- if (frame == 0)
+ if (frame == 0 && dinfo[idx].caller < 0)
return 0;
if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
- frame--;
- i = ret_insn[frame];
- idx = ret_prog[frame];
+
+ /* pop caller idx from callee */
+ idx = dinfo[idx].caller;
+
+ /* retrieve caller state from its frame */
+ frame = dinfo[idx].frame;
+ i = dinfo[idx].ret_insn;
+
goto continue_func;
}
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN;
+ struct bpf_subprog_call_depth_info *dinfo;
struct bpf_subprog_info *si = env->subprog_info;
bool priv_stack_supported;
int ret;
+ dinfo = kvcalloc(env->subprog_cnt, sizeof(*dinfo), GFP_KERNEL_ACCOUNT);
+ if (!dinfo)
+ return -ENOMEM;
+
for (int i = 0; i < env->subprog_cnt; i++) {
if (si[i].has_tail_call) {
priv_stack_mode = NO_PRIV_STACK;
@@ -6926,9 +5527,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
for (int i = env->subprog_cnt - 1; i >= 0; i--) {
if (!i || si[i].is_async_cb) {
priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
- ret = check_max_stack_depth_subprog(env, i, priv_stack_supported);
- if (ret < 0)
+ ret = check_max_stack_depth_subprog(env, i, dinfo,
+ priv_stack_supported);
+ if (ret < 0) {
+ kvfree(dinfo);
return ret;
+ }
}
}
@@ -6939,21 +5543,10 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
}
}
- return 0;
-}
+ kvfree(dinfo);
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-static int get_callee_stack_depth(struct bpf_verifier_env *env,
- const struct bpf_insn *insn, int idx)
-{
- int start = idx + insn->imm + 1, subprog;
-
- subprog = find_subprog(env, start);
- if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start))
- return -EFAULT;
- return env->subprog_info[subprog].stack_depth;
+ return 0;
}
-#endif
static int __check_buffer_access(struct bpf_verifier_env *env,
const char *buf_info,
@@ -6966,7 +5559,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env,
regno, buf_info, off, size);
return -EACCES;
}
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ if (!tnum_is_const(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -6989,8 +5582,8 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env,
if (err)
return err;
- if (off + size > env->prog->aux->max_tp_access)
- env->prog->aux->max_tp_access = off + size;
+ env->prog->aux->max_tp_access = max(reg->var_off.value + off + size,
+ env->prog->aux->max_tp_access);
return 0;
}
@@ -7008,8 +5601,7 @@ static int check_buffer_access(struct bpf_verifier_env *env,
if (err)
return err;
- if (off + size > *max_access)
- *max_access = off + size;
+ *max_access = max(reg->var_off.value + off + size, *max_access);
return 0;
}
@@ -7196,7 +5788,7 @@ out:
set_sext32_default_val(reg, size);
}
-static bool bpf_map_is_rdonly(const struct bpf_map *map)
+bool bpf_map_is_rdonly(const struct bpf_map *map)
{
/* A map is considered read-only if the following condition are true:
*
@@ -7216,8 +5808,8 @@ static bool bpf_map_is_rdonly(const struct bpf_map *map)
!bpf_map_write_active(map);
}
-static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
- bool is_ldsx)
+int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
+ bool is_ldsx)
{
void *ptr;
u64 addr;
@@ -7402,13 +5994,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
tname);
return -EINVAL;
}
- if (off < 0) {
- verbose(env,
- "R%d is ptr_%s invalid negative access: off=%d\n",
- regno, tname, off);
- return -EACCES;
- }
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+
+ if (!tnum_is_const(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -7418,6 +6005,15 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
+ off += reg->var_off.value;
+
+ if (off < 0) {
+ verbose(env,
+ "R%d is ptr_%s invalid negative access: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
if (reg->type & MEM_USER) {
verbose(env,
"R%d is ptr_%s access user memory: off=%d\n",
@@ -7629,7 +6225,7 @@ static int check_stack_access_within_bounds(
enum bpf_access_type type)
{
struct bpf_reg_state *reg = reg_state(env, regno);
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
s64 min_off, max_off;
int err;
char *err_extra;
@@ -7664,8 +6260,8 @@ static int check_stack_access_within_bounds(
if (err) {
if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid%s stack R%d off=%d size=%d\n",
- err_extra, regno, off, access_size);
+ verbose(env, "invalid%s stack R%d off=%lld size=%d\n",
+ err_extra, regno, min_off, access_size);
} else {
char tn_buf[48];
@@ -7693,6 +6289,23 @@ static bool get_func_retval_range(struct bpf_prog *prog,
return false;
}
+static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val)
+{
+ struct bpf_reg_state fake_reg;
+
+ if (!val)
+ return;
+
+ fake_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&fake_reg, val);
+
+ scalar32_min_max_add(dst_reg, &fake_reg);
+ scalar_min_max_add(dst_reg, &fake_reg);
+ dst_reg->var_off = tnum_add(dst_reg->var_off, fake_reg.var_off);
+
+ reg_bounds_sync(dst_reg);
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -7711,14 +6324,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (size < 0)
return size;
- /* alignment checks will add in reg->off themselves */
err = check_ptr_alignment(env, reg, off, size, strict_alignment_once);
if (err)
return err;
- /* for access checks, reg->off is just part of off */
- off += reg->off;
-
if (reg->type == PTR_TO_MAP_KEY) {
if (t == BPF_WRITE) {
verbose(env, "write to change key R%d not allowed\n", regno);
@@ -7778,6 +6387,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
copy_register_state(&regs[value_regno], reg);
+ add_scalar_to_reg(&regs[value_regno], off);
regs[value_regno].type = PTR_TO_INSN;
} else {
mark_reg_unknown(env, regs, value_regno);
@@ -7815,12 +6425,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
- struct bpf_retval_range range;
struct bpf_insn_access_aux info = {
.reg_type = SCALAR_VALUE,
.is_ldsx = is_ldsx,
.log = &env->log,
};
+ struct bpf_retval_range range;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
@@ -7828,13 +6438,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
- err = check_ptr_off_reg(env, reg, regno);
- if (err < 0)
- return err;
-
- err = check_ctx_access(env, insn_idx, off, size, t, &info);
- if (err)
- verbose_linfo(env, insn_idx, "; ");
+ err = check_ctx_access(env, insn_idx, regno, off, size, t, &info);
if (!err && t == BPF_READ && value_regno >= 0) {
/* ctx access returns either a scalar, or a
* PTR_TO_PACKET[_META,_END]. In the latter
@@ -8198,8 +6802,6 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn)
* on the access type and privileges, that all elements of the stack are
* initialized.
*
- * 'off' includes 'regno->off', but not its dynamic part (if any).
- *
* All registers that have been spilled on the stack in the slots within the
* read offsets are marked as read.
*/
@@ -8209,21 +6811,27 @@ static int check_stack_range_initialized(
enum bpf_access_type type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *reg = reg_state(env, regno);
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int err, min_off, max_off, i, j, slot, spi;
/* Some accesses can write anything into the stack, others are
* read-only.
*/
- bool clobber = false;
+ bool clobber = type == BPF_WRITE;
+ /*
+ * Negative access_size signals global subprog/kfunc arg check where
+ * STACK_POISON slots are acceptable. static stack liveness
+ * might have determined that subprog doesn't read them,
+ * but BTF based global subprog validation isn't accurate enough.
+ */
+ bool allow_poison = access_size < 0 || clobber;
+
+ access_size = abs(access_size);
if (access_size == 0 && !zero_size_allowed) {
verbose(env, "invalid zero-sized read\n");
return -EACCES;
}
- if (type == BPF_WRITE)
- clobber = true;
-
err = check_stack_access_within_bounds(env, regno, off, access_size, type);
if (err)
return err;
@@ -8275,7 +6883,7 @@ static int check_stack_range_initialized(
for (i = min_off; i < max_off + access_size; i++) {
int stack_off = -i - 1;
- spi = __get_spi(i);
+ spi = bpf_get_spi(i);
/* raw_mode may write past allocated_stack */
if (state->allocated_stack <= stack_off)
continue;
@@ -8311,7 +6919,7 @@ static int check_stack_range_initialized(
goto mark;
}
- if (is_spilled_reg(&state->stack[spi]) &&
+ if (bpf_is_spilled_reg(&state->stack[spi]) &&
(state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
env->allow_ptr_leaks)) {
if (clobber) {
@@ -8322,7 +6930,12 @@ static int check_stack_range_initialized(
goto mark;
}
- if (tnum_is_const(reg->var_off)) {
+ if (*stype == STACK_POISON) {
+ if (allow_poison)
+ goto mark;
+ verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n",
+ regno, min_off, i - min_off, access_size);
+ } else if (tnum_is_const(reg->var_off)) {
verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
regno, min_off, i - min_off, access_size);
} else {
@@ -8334,17 +6947,7 @@ static int check_stack_range_initialized(
}
return -EACCES;
mark:
- /* reading any byte out of 8-byte 'spill_slot' will cause
- * the whole slot to be marked as 'read'
- */
- err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi));
- if (err)
- return err;
- /* We do not call bpf_mark_stack_write(), as we can not
- * be sure that whether stack slot is written to or not. Hence,
- * we must still conservatively propagate reads upwards even if
- * helper may write to the entire memory range.
- */
+ ;
}
return 0;
}
@@ -8360,7 +6963,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
switch (base_type(reg->type)) {
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
- return check_packet_access(env, regno, reg->off, access_size,
+ return check_packet_access(env, regno, 0, access_size,
zero_size_allowed);
case PTR_TO_MAP_KEY:
if (access_type == BPF_WRITE) {
@@ -8368,12 +6971,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
reg_type_str(env, reg->type));
return -EACCES;
}
- return check_mem_region_access(env, regno, reg->off, access_size,
+ return check_mem_region_access(env, regno, 0, access_size,
reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
- if (check_map_access_type(env, regno, reg->off, access_size, access_type))
+ if (check_map_access_type(env, regno, 0, access_size, access_type))
return -EACCES;
- return check_map_access(env, regno, reg->off, access_size,
+ return check_map_access(env, regno, 0, access_size,
zero_size_allowed, ACCESS_HELPER);
case PTR_TO_MEM:
if (type_is_rdonly_mem(reg->type)) {
@@ -8383,7 +6986,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
}
- return check_mem_region_access(env, regno, reg->off,
+ return check_mem_region_access(env, regno, 0,
access_size, reg->mem_size,
zero_size_allowed);
case PTR_TO_BUF:
@@ -8398,39 +7001,33 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
} else {
max_access = &env->prog->aux->max_rdwr_access;
}
- return check_buffer_access(env, reg, regno, reg->off,
+ return check_buffer_access(env, reg, regno, 0,
access_size, zero_size_allowed,
max_access);
case PTR_TO_STACK:
return check_stack_range_initialized(
env,
- regno, reg->off, access_size,
+ regno, 0, access_size,
zero_size_allowed, access_type, meta);
case PTR_TO_BTF_ID:
- return check_ptr_to_btf_access(env, regs, regno, reg->off,
+ return check_ptr_to_btf_access(env, regs, regno, 0,
access_size, BPF_READ, -1);
case PTR_TO_CTX:
- /* in case the function doesn't know how to access the context,
- * (because we are in a program of type SYSCALL for example), we
- * can not statically check its size.
- * Dynamically check it now.
- */
- if (!env->ops->convert_ctx_access) {
- int offset = access_size - 1;
-
- /* Allow zero-byte read from PTR_TO_CTX */
- if (access_size == 0)
- return zero_size_allowed ? 0 : -EACCES;
-
- return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
- access_type, -1, false, false);
+ /* Only permit reading or writing syscall context using helper calls. */
+ if (is_var_ctx_off_allowed(env->prog)) {
+ int err = check_mem_region_access(env, regno, 0, access_size, U16_MAX,
+ zero_size_allowed);
+ if (err)
+ return err;
+ if (env->prog->aux->max_ctx_offset < reg->umax_value + access_size)
+ env->prog->aux->max_ctx_offset = reg->umax_value + access_size;
+ return 0;
}
-
fallthrough;
default: /* scalar_value or invalid ptr */
/* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
- register_is_null(reg))
+ bpf_register_is_null(reg))
return 0;
verbose(env, "R%d type=%s ", regno,
@@ -8503,7 +7100,7 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
struct bpf_reg_state saved_reg;
int err;
- if (register_is_null(reg))
+ if (bpf_register_is_null(reg))
return 0;
/* Assuming that the register contains a value check if the memory
@@ -8515,8 +7112,10 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
mark_ptr_not_null_reg(reg);
}
- err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL);
- err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL);
+ int size = base_type(reg->type) == PTR_TO_STACK ? -(int)mem_size : mem_size;
+
+ err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL);
+ err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL);
if (may_be_null)
*reg = saved_reg;
@@ -8619,9 +7218,9 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
return -EINVAL;
}
spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off;
- if (spin_lock_off != val + reg->off) {
+ if (spin_lock_off != val) {
verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n",
- val + reg->off, lock_str, spin_lock_off);
+ val, lock_str, spin_lock_off);
return -EINVAL;
}
if (is_lock) {
@@ -8736,9 +7335,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
verifier_bug(env, "unsupported BTF field type: %s\n", struct_name);
return -EINVAL;
}
- if (field_off != val + reg->off) {
+ if (field_off != val) {
verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n",
- val + reg->off, struct_name, field_off);
+ val, struct_name, field_off);
return -EINVAL;
}
if (map_desc->ptr) {
@@ -8806,7 +7405,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
return -EINVAL;
}
- kptr_off = reg->off + reg->var_off.value;
+ kptr_off = reg->var_off.value;
kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR);
if (!kptr_field) {
verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
@@ -8927,7 +7526,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
return state->stack[spi].spilled_ptr.ref_obj_id;
}
@@ -8942,10 +7541,6 @@ static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_ITER_NEW;
}
-static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
-{
- return meta->kfunc_flags & KF_ITER_NEXT;
-}
static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -9063,7 +7658,7 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
struct list_head *pos, *head;
/* Explored states are pushed in stack order, most recent states come first */
- head = explored_state(env, insn_idx);
+ head = bpf_explored_state(env, insn_idx);
list_for_each(pos, head) {
sl = container_of(pos, struct bpf_verifier_state_list, node);
/* If st->branches != 0 state is a part of current DFS verification path,
@@ -9078,11 +7673,6 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
return NULL;
}
-static void reset_idmap_scratch(struct bpf_verifier_env *env);
-static bool regs_exact(const struct bpf_reg_state *rold,
- const struct bpf_reg_state *rcur,
- struct bpf_idmap *idmap);
-
/*
* Check if scalar registers are exact for the purpose of not widening.
* More lenient than regs_exact()
@@ -9124,8 +7714,8 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
num_slots = min(fold->allocated_stack / BPF_REG_SIZE,
fcur->allocated_stack / BPF_REG_SIZE);
for (i = 0; i < num_slots; i++) {
- if (!is_spilled_reg(&fold->stack[i]) ||
- !is_spilled_reg(&fcur->stack[i]))
+ if (!bpf_is_spilled_reg(&fold->stack[i]) ||
+ !bpf_is_spilled_reg(&fcur->stack[i]))
continue;
maybe_widen_reg(env,
@@ -9369,6 +7959,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MEM | MEM_RINGBUF,
PTR_TO_BUF,
PTR_TO_BTF_ID | PTR_TRUSTED,
+ PTR_TO_CTX,
},
};
@@ -9405,7 +7996,9 @@ static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE }
static const struct bpf_reg_types kptr_xchg_dest_types = {
.types = {
PTR_TO_MAP_VALUE,
- PTR_TO_BTF_ID | MEM_ALLOC
+ PTR_TO_BTF_ID | MEM_ALLOC,
+ PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF,
+ PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU,
}
};
static const struct bpf_reg_types dynptr_types = {
@@ -9449,7 +8042,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
struct bpf_reg_state *reg = reg_state(env, regno);
enum bpf_reg_type expected, type = reg->type;
const struct bpf_reg_types *compatible;
- int i, j;
+ int i, j, err;
compatible = compatible_reg_types[base_type(arg_type)];
if (!compatible) {
@@ -9552,8 +8145,12 @@ found:
return -EACCES;
}
- if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
- btf_vmlinux, *arg_btf_id,
+ err = __check_ptr_off_reg(env, reg, regno, true);
+ if (err)
+ return err;
+
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id,
+ reg->var_off.value, btf_vmlinux, *arg_btf_id,
strict_type_match)) {
verbose(env, "R%d is of type %s but %s is expected\n",
regno, btf_type_name(reg->btf, reg->btf_id),
@@ -9565,6 +8162,8 @@ found:
}
case PTR_TO_BTF_ID | MEM_ALLOC:
case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
meta->func_id != BPF_FUNC_kptr_xchg) {
verifier_bug(env, "unimplemented handling of MEM_ALLOC");
@@ -9631,12 +8230,11 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
* because fixed_off_ok is false, but checking here allows us
* to give the user a better error message.
*/
- if (reg->off) {
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) {
verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
regno);
return -EINVAL;
}
- return __check_ptr_off_reg(env, reg, regno, false);
}
switch (type) {
@@ -9671,6 +8269,16 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
* still need to do checks instead of returning.
*/
return __check_ptr_off_reg(env, reg, regno, true);
+ case PTR_TO_CTX:
+ /*
+ * Allow fixed and variable offsets for syscall context, but
+ * only when the argument is passed as memory, not ctx,
+ * otherwise we may get modified ctx in tail called programs and
+ * global subprogs (that may act as extension prog hooks).
+ */
+ if (arg_type != ARG_PTR_TO_CTX && is_var_ctx_off_allowed(env->prog))
+ return 0;
+ fallthrough;
default:
return __check_ptr_off_reg(env, reg, regno, false);
}
@@ -9700,7 +8308,7 @@ static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi;
if (reg->type == CONST_PTR_TO_DYNPTR)
@@ -9713,7 +8321,7 @@ static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi;
if (reg->type == CONST_PTR_TO_DYNPTR)
@@ -9727,13 +8335,13 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state
static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi;
if (reg->type == CONST_PTR_TO_DYNPTR)
return reg->dynptr.type;
- spi = __get_spi(reg->off);
+ spi = bpf_get_spi(reg->var_off.value);
if (spi < 0) {
verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
return BPF_DYNPTR_TYPE_INVALID;
@@ -9774,13 +8382,13 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
return -EACCES;
}
- err = check_map_access(env, regno, reg->off,
- map->value_size - reg->off, false,
+ err = check_map_access(env, regno, 0,
+ map->value_size - reg->var_off.value, false,
ACCESS_HELPER);
if (err)
return err;
- map_off = reg->off + reg->var_off.value;
+ map_off = reg->var_off.value;
err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
if (err) {
verbose(env, "direct value access on string failed\n");
@@ -9801,7 +8409,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env,
u32 key_size,
s64 *value)
{
- struct bpf_func_state *state = func(env, key);
+ struct bpf_func_state *state = bpf_func(env, key);
struct bpf_reg_state *reg;
int slot, spi, off;
int spill_size = 0;
@@ -9817,7 +8425,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env,
if (!tnum_is_const(key->var_off))
return -EOPNOTSUPP;
- stack_off = key->off + key->var_off.value;
+ stack_off = key->var_off.value;
slot = -stack_off - 1;
spi = slot / BPF_REG_SIZE;
off = slot % BPF_REG_SIZE;
@@ -9832,7 +8440,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env,
}
/* Check that stack contains a scalar spill of expected size */
- if (!is_spilled_scalar_reg(&state->stack[spi]))
+ if (!bpf_is_spilled_scalar_reg(&state->stack[spi]))
return -EOPNOTSUPP;
for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--)
spill_size++;
@@ -9847,7 +8455,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env,
/* We are relying on a constant value. So mark as precise
* to prevent pruning on it.
*/
- bt_set_frame_slot(&env->bt, key->frameno, spi);
+ bpf_bt_set_frame_slot(&env->bt, key->frameno, spi);
err = mark_chain_precision_batch(env, env->cur_state);
if (err < 0)
return err;
@@ -9899,7 +8507,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
return err;
}
- if (register_is_null(reg) && type_may_be_null(arg_type))
+ if (bpf_register_is_null(reg) && type_may_be_null(arg_type))
/* A NULL register has a SCALAR_VALUE type, so skip
* type checking.
*/
@@ -9921,7 +8529,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
skip_type_check:
if (arg_type_is_release(arg_type)) {
if (arg_type_is_dynptr(arg_type)) {
- struct bpf_func_state *state = func(env, reg);
+ struct bpf_func_state *state = bpf_func(env, reg);
int spi;
/* Only dynptr created on stack can be released, thus
@@ -9939,7 +8547,7 @@ skip_type_check:
verbose(env, "cannot release unowned const bpf_dynptr\n");
return -EINVAL;
}
- } else if (!reg->ref_obj_id && !register_is_null(reg)) {
+ } else if (!reg->ref_obj_id && !bpf_register_is_null(reg)) {
verbose(env, "R%d must be referenced when passed to release function\n",
regno);
return -EINVAL;
@@ -10018,7 +8626,7 @@ skip_type_check:
}
break;
case ARG_PTR_TO_MAP_VALUE:
- if (type_may_be_null(arg_type) && register_is_null(reg))
+ if (type_may_be_null(arg_type) && bpf_register_is_null(reg))
return 0;
/* bpf_map_xxx(..., map_ptr, ..., value) call:
@@ -10166,7 +8774,7 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
return false;
}
-static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
+bool bpf_allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
{
return env->prog->jit_requested &&
bpf_jit_supports_subprog_tailcalls();
@@ -10311,7 +8919,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_FUNC_tail_call:
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
goto error;
- if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
+ if (env->subprog_cnt > 1 && !bpf_allow_tail_call_in_subprogs(env)) {
verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n");
return -EINVAL;
}
@@ -10623,7 +9231,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
/* after the call registers r0 - r5 were scratched */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(env, regs, caller_saved[i]);
+ bpf_mark_reg_not_init(env, &regs[caller_saved[i]]);
__check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
}
}
@@ -10718,7 +9326,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
* invalid memory access.
*/
} else if (arg->arg_type == ARG_PTR_TO_CTX) {
- ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_CTX);
if (ret < 0)
return ret;
/* If function expects ctx type in BTF check that caller
@@ -10762,7 +9370,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
struct bpf_call_arg_meta meta;
int err;
- if (register_is_null(reg) && type_may_be_null(arg->arg_type))
+ if (bpf_register_is_null(reg) && type_may_be_null(arg->arg_type))
continue;
memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
@@ -10844,7 +9452,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
return -EFAULT;
}
- if (is_async_callback_calling_insn(insn)) {
+ if (bpf_is_async_callback_calling_insn(insn)) {
struct bpf_verifier_state *async_cb;
/* there is no real recursion here. timer and workqueue callbacks are async */
@@ -10891,7 +9499,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int err, subprog, target_insn;
target_insn = *insn_idx + insn->imm + 1;
- subprog = find_subprog(env, target_insn);
+ subprog = bpf_find_subprog(env, target_insn);
if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program",
target_insn))
return -EFAULT;
@@ -10900,7 +9508,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
err = btf_check_subprog_call(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
- if (subprog_is_global(env, subprog)) {
+ if (bpf_subprog_is_global(env, subprog)) {
const char *sub_name = subprog_name(env, subprog);
if (env->cur_state->active_locks) {
@@ -10909,12 +9517,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
}
- if (env->subprog_info[subprog].might_sleep &&
- (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks ||
- env->cur_state->active_irq_id || !in_sleepable(env))) {
- verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n"
- "i.e., in a RCU/IRQ/preempt-disabled section, or in\n"
- "a non-sleepable BPF program context\n");
+ if (env->subprog_info[subprog].might_sleep && !in_sleepable_context(env)) {
+ verbose(env, "sleepable global function %s() called in %s\n",
+ sub_name, non_sleepable_context_description(env));
return -EINVAL;
}
@@ -10933,9 +9538,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
subprog_aux(env, subprog)->called = true;
clear_caller_saved_regs(env, caller->regs);
- /* All global functions return a 64-bit SCALAR_VALUE */
- mark_reg_unknown(env, caller->regs, BPF_REG_0);
- caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+ /* All non-void global functions return a 64-bit SCALAR_VALUE. */
+ if (!subprog_returns_void(env, subprog)) {
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+ }
/* continue with next insn after call */
return 0;
@@ -10953,8 +9560,6 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* and go analyze first insn of the callee */
*insn_idx = env->subprog_info[subprog].start - 1;
- bpf_reset_live_stack_callchain(env);
-
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
print_verifier_state(env, state, caller->frameno, true);
@@ -10988,7 +9593,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env,
callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
return 0;
}
@@ -11045,9 +9650,9 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
callee->callback_ret_range = retval_range(0, 1);
@@ -11077,8 +9682,8 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
callee->regs[BPF_REG_3].map_ptr = map_ptr;
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
callee->callback_ret_range = retval_range(0, 0);
return 0;
@@ -11105,8 +9710,8 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
callee->callback_ret_range = retval_range(0, 1);
return 0;
@@ -11121,14 +9726,14 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
* callback_ctx, u64 flags);
* callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
*/
- __mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
callee->callback_ret_range = retval_range(0, 1);
@@ -11149,7 +9754,8 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
*/
struct btf_field *field;
- field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off,
+ field = reg_find_field_offset(&caller->regs[BPF_REG_1],
+ caller->regs[BPF_REG_1].var_off.value,
BPF_RB_ROOT);
if (!field || !field->graph_root.value_btf_id)
return -EFAULT;
@@ -11159,9 +9765,9 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root);
ref_set_non_owning(env, &callee->regs[BPF_REG_2]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
callee->callback_ret_range = retval_range(0, 1);
return 0;
@@ -11190,8 +9796,8 @@ static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env,
callee->regs[BPF_REG_3].map_ptr = map_ptr;
/* unused */
- __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
- __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
callee->callback_ret_range = retval_range(S32_MIN, S32_MAX);
return 0;
@@ -11222,10 +9828,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
return is_rbtree_lock_required_kfunc(kfunc_btf_id);
}
-static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg,
- bool return_32bit)
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
{
- if (return_32bit)
+ if (range.return_32bit)
return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval;
else
return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
@@ -11239,10 +9844,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
bool in_callback_fn;
int err;
- err = bpf_update_live_stack(env);
- if (err)
- return err;
-
callee = state->frame[state->curframe];
r0 = &callee->regs[BPF_REG_0];
if (r0->type == PTR_TO_STACK) {
@@ -11269,7 +9870,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return err;
/* enforce R0 return value range, and bpf_callback_t returns 64bit */
- if (!retval_range_within(callee->callback_ret_range, r0, false)) {
+ if (!retval_range_within(callee->callback_ret_range, r0)) {
verbose_invalid_scalar(env, r0, callee->callback_ret_range,
"At callback return", "R0");
return -EINVAL;
@@ -11525,7 +10126,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
/* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const
* and map_direct_value_addr is set.
*/
- fmt_map_off = fmt_reg->off + fmt_reg->var_off.value;
+ fmt_map_off = fmt_reg->var_off.value;
err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
fmt_map_off);
if (err) {
@@ -11551,7 +10152,7 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
if (type == BPF_PROG_TYPE_TRACING) {
if (!bpf_prog_has_trampoline(env->prog)) {
- verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
+ verbose(env, "func %s#%d supported only for fentry/fexit/fsession/fmod_ret programs\n",
func_id_name(func_id), func_id);
return -ENOTSUPP;
}
@@ -11573,7 +10174,7 @@ static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env)
static bool loop_flag_is_zero(struct bpf_verifier_env *env)
{
struct bpf_reg_state *reg = reg_state(env, BPF_REG_4);
- bool reg_is_null = register_is_null(reg);
+ bool reg_is_null = bpf_register_is_null(reg);
if (reg_is_null)
mark_chain_precision(env, BPF_REG_4);
@@ -11614,8 +10215,8 @@ static bool can_elide_value_nullness(enum bpf_map_type type)
}
}
-static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
- const struct bpf_func_proto **ptr)
+int bpf_get_helper_proto(struct bpf_verifier_env *env, int func_id,
+ const struct bpf_func_proto **ptr)
{
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID)
return -ERANGE;
@@ -11637,6 +10238,19 @@ static inline bool in_sleepable_context(struct bpf_verifier_env *env)
in_sleepable(env);
}
+static const char *non_sleepable_context_description(struct bpf_verifier_env *env)
+{
+ if (env->cur_state->active_rcu_locks)
+ return "rcu_read_lock region";
+ if (env->cur_state->active_preempt_locks)
+ return "non-preemptible region";
+ if (env->cur_state->active_irq_id)
+ return "IRQ-disabled region";
+ if (env->cur_state->active_locks)
+ return "lock region";
+ return "non-sleepable prog";
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -11653,7 +10267,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* find function prototype */
func_id = insn->imm;
- err = get_helper_proto(env, insn->imm, &fn);
+ err = bpf_get_helper_proto(env, insn->imm, &fn);
if (err == -ERANGE) {
verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id);
return -EINVAL;
@@ -11676,11 +10290,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
- if (!in_sleepable(env) && fn->might_sleep) {
- verbose(env, "helper call might sleep in a non-sleepable prog\n");
- return -EINVAL;
- }
-
/* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(func_id);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
@@ -11697,28 +10306,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
- if (env->cur_state->active_rcu_locks) {
- if (fn->might_sleep) {
- verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
- func_id_name(func_id), func_id);
- return -EINVAL;
- }
- }
-
- if (env->cur_state->active_preempt_locks) {
- if (fn->might_sleep) {
- verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
- func_id_name(func_id), func_id);
- return -EINVAL;
- }
- }
-
- if (env->cur_state->active_irq_id) {
- if (fn->might_sleep) {
- verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n",
- func_id_name(func_id), func_id);
- return -EINVAL;
- }
+ if (fn->might_sleep && !in_sleepable_context(env)) {
+ verbose(env, "sleepable helper %s#%d in %s\n", func_id_name(func_id), func_id,
+ non_sleepable_context_description(env));
+ return -EINVAL;
}
/* Track non-sleepable context for helpers. */
@@ -11779,7 +10370,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
} else if (meta.ref_obj_id) {
err = release_reference(env, meta.ref_obj_id);
- } else if (register_is_null(&regs[meta.release_regno])) {
+ } else if (bpf_register_is_null(&regs[meta.release_regno])) {
/* meta.ref_obj_id can only be 0 if register that is meant to be
* released is NULL, which must be > R0.
*/
@@ -11802,7 +10393,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* check that flags argument in get_local_storage(map, flags) is 0,
* this is required because get_local_storage() can't return an error.
*/
- if (!register_is_null(&regs[BPF_REG_2])) {
+ if (!bpf_register_is_null(&regs[BPF_REG_2])) {
verbose(env, "get_local_storage() doesn't support non-zero flags\n");
return -EINVAL;
}
@@ -11945,7 +10536,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(env, regs, caller_saved[i]);
+ bpf_mark_reg_not_init(env, &regs[caller_saved[i]]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
@@ -12207,10 +10798,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_RELEASE;
}
-static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
-{
- return meta->kfunc_flags & KF_SLEEPABLE;
-}
static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -12431,6 +11018,28 @@ static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_PROG_AUX_ID);
}
+/*
+ * A kfunc with KF_IMPLICIT_ARGS has two prototypes in BTF:
+ * - the _impl prototype with full arg list (meta->func_proto)
+ * - the BPF API prototype w/o implicit args (func->type in BTF)
+ * To determine whether an argument is implicit, we compare its position
+ * against the number of arguments in the prototype w/o implicit args.
+ */
+static bool is_kfunc_arg_implicit(const struct bpf_kfunc_call_arg_meta *meta, u32 arg_idx)
+{
+ const struct btf_type *func, *func_proto;
+ u32 argn;
+
+ if (!(meta->kfunc_flags & KF_IMPLICIT_ARGS))
+ return false;
+
+ func = btf_type_by_id(meta->btf, meta->func_id);
+ func_proto = btf_type_by_id(meta->btf, func->type);
+ argn = btf_type_vlen(func_proto);
+
+ return argn <= arg_idx;
+}
+
/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
const struct btf *btf,
@@ -12497,10 +11106,15 @@ enum kfunc_ptr_arg_type {
enum special_kfunc_type {
KF_bpf_obj_new_impl,
+ KF_bpf_obj_new,
KF_bpf_obj_drop_impl,
+ KF_bpf_obj_drop,
KF_bpf_refcount_acquire_impl,
+ KF_bpf_refcount_acquire,
KF_bpf_list_push_front_impl,
+ KF_bpf_list_push_front,
KF_bpf_list_push_back_impl,
+ KF_bpf_list_push_back,
KF_bpf_list_pop_front,
KF_bpf_list_pop_back,
KF_bpf_list_front,
@@ -12511,6 +11125,7 @@ enum special_kfunc_type {
KF_bpf_rcu_read_unlock,
KF_bpf_rbtree_remove,
KF_bpf_rbtree_add_impl,
+ KF_bpf_rbtree_add,
KF_bpf_rbtree_first,
KF_bpf_rbtree_root,
KF_bpf_rbtree_left,
@@ -12523,7 +11138,9 @@ enum special_kfunc_type {
KF_bpf_dynptr_slice_rdwr,
KF_bpf_dynptr_clone,
KF_bpf_percpu_obj_new_impl,
+ KF_bpf_percpu_obj_new,
KF_bpf_percpu_obj_drop_impl,
+ KF_bpf_percpu_obj_drop,
KF_bpf_throw,
KF_bpf_wq_set_callback,
KF_bpf_preempt_disable,
@@ -12557,10 +11174,15 @@ enum special_kfunc_type {
BTF_ID_LIST(special_kfunc_list)
BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_new)
BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_obj_drop)
BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_refcount_acquire)
BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_front)
BTF_ID(func, bpf_list_push_back_impl)
+BTF_ID(func, bpf_list_push_back)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
BTF_ID(func, bpf_list_front)
@@ -12571,6 +11193,7 @@ BTF_ID(func, bpf_rcu_read_lock)
BTF_ID(func, bpf_rcu_read_unlock)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add_impl)
+BTF_ID(func, bpf_rbtree_add)
BTF_ID(func, bpf_rbtree_first)
BTF_ID(func, bpf_rbtree_root)
BTF_ID(func, bpf_rbtree_left)
@@ -12590,7 +11213,9 @@ BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_new)
BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_percpu_obj_drop)
BTF_ID(func, bpf_throw)
BTF_ID(func, bpf_wq_set_callback)
BTF_ID(func, bpf_preempt_disable)
@@ -12634,6 +11259,50 @@ BTF_ID(func, bpf_session_is_return)
BTF_ID(func, bpf_stream_vprintk)
BTF_ID(func, bpf_stream_print_stack)
+static bool is_bpf_obj_new_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_obj_new] ||
+ func_id == special_kfunc_list[KF_bpf_obj_new_impl];
+}
+
+static bool is_bpf_percpu_obj_new_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_percpu_obj_new] ||
+ func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl];
+}
+
+static bool is_bpf_obj_drop_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_obj_drop] ||
+ func_id == special_kfunc_list[KF_bpf_obj_drop_impl];
+}
+
+static bool is_bpf_percpu_obj_drop_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_percpu_obj_drop] ||
+ func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl];
+}
+
+static bool is_bpf_refcount_acquire_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_refcount_acquire] ||
+ func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
+}
+
+static bool is_bpf_list_push_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_list_push_front] ||
+ func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ func_id == special_kfunc_list[KF_bpf_list_push_back] ||
+ func_id == special_kfunc_list[KF_bpf_list_push_back_impl];
+}
+
+static bool is_bpf_rbtree_add_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_rbtree_add] ||
+ func_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
+}
+
static bool is_task_work_add_kfunc(u32 func_id)
{
return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] ||
@@ -12642,10 +11311,8 @@ static bool is_task_work_add_kfunc(u32 func_id)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
- if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
- meta->arg_owning_ref) {
+ if (is_bpf_refcount_acquire_kfunc(meta->func_id) && meta->arg_owning_ref)
return false;
- }
return meta->kfunc_flags & KF_RET_NULL;
}
@@ -12670,7 +11337,7 @@ static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
}
-static bool is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta)
+bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta)
{
return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data];
}
@@ -12705,7 +11372,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
return KF_ARG_PTR_TO_CTX;
- if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg) &&
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && bpf_register_is_null(reg) &&
!arg_mem_size)
return KF_ARG_PTR_TO_NULL;
@@ -12831,13 +11498,12 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
strict_type_match = true;
- WARN_ON_ONCE(is_kfunc_release(meta) &&
- (reg->off || !tnum_is_const(reg->var_off) ||
- reg->var_off.value));
+ WARN_ON_ONCE(is_kfunc_release(meta) && !tnum_is_const(reg->var_off));
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
- struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match);
+ struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->var_off.value,
+ meta->btf, ref_id, strict_type_match);
/* If kfunc is accepting a projection type (ie. __sk_buff), it cannot
* actually use it -- it must cast to the underlying type. So we allow
* caller to pass in the underlying type.
@@ -13034,8 +11700,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
static bool is_bpf_list_api_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
- btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ return is_bpf_list_push_kfunc(btf_id) ||
btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
btf_id == special_kfunc_list[KF_bpf_list_pop_back] ||
btf_id == special_kfunc_list[KF_bpf_list_front] ||
@@ -13044,7 +11709,7 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)
static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ return is_bpf_rbtree_add_kfunc(btf_id) ||
btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_first] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_root] ||
@@ -13061,8 +11726,9 @@ static bool is_bpf_iter_num_api_kfunc(u32 btf_id)
static bool is_bpf_graph_api_kfunc(u32 btf_id)
{
- return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) ||
- btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
+ return is_bpf_list_api_kfunc(btf_id) ||
+ is_bpf_rbtree_api_kfunc(btf_id) ||
+ is_bpf_refcount_acquire_kfunc(btf_id);
}
static bool is_bpf_res_spin_lock_kfunc(u32 btf_id)
@@ -13095,7 +11761,7 @@ static bool kfunc_spin_allowed(u32 btf_id)
static bool is_sync_callback_calling_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
+ return is_bpf_rbtree_add_kfunc(btf_id);
}
static bool is_async_callback_calling_kfunc(u32 btf_id)
@@ -13159,12 +11825,11 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
switch (node_field_type) {
case BPF_LIST_NODE:
- ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
- kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
+ ret = is_bpf_list_push_kfunc(kfunc_btf_id);
break;
case BPF_RB_NODE:
- ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]);
break;
@@ -13209,7 +11874,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
}
rec = reg_btf_record(reg);
- head_off = reg->off + reg->var_off.value;
+ head_off = reg->var_off.value;
field = btf_record_find(rec, head_off, head_field_type);
if (!field) {
verbose(env, "%s not found at offset=%u\n", head_type_name, head_off);
@@ -13276,7 +11941,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
return -EINVAL;
}
- node_off = reg->off + reg->var_off.value;
+ node_off = reg->var_off.value;
field = reg_find_field_offset(reg, node_off, node_field_type);
if (!field) {
verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
@@ -13381,11 +12046,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
bool is_ret_buf_sz = false;
int kf_arg_type;
- t = btf_type_skip_modifiers(btf, args[i].type, NULL);
-
- if (is_kfunc_arg_ignore(btf, &args[i]))
- continue;
-
if (is_kfunc_arg_prog_aux(btf, &args[i])) {
/* Reject repeated use bpf_prog_aux */
if (meta->arg_prog) {
@@ -13397,6 +12057,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
continue;
}
+ if (is_kfunc_arg_ignore(btf, &args[i]) || is_kfunc_arg_implicit(meta, i))
+ continue;
+
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+
if (btf_type_is_scalar(t)) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "R%d is not a scalar\n", regno);
@@ -13448,7 +12113,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
- if ((register_is_null(reg) || type_may_be_null(reg->type)) &&
+ if ((bpf_register_is_null(reg) || type_may_be_null(reg->type)) &&
!is_kfunc_arg_nullable(meta->btf, &args[i])) {
verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
return -EACCES;
@@ -13525,7 +12190,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
}
fallthrough;
- case KF_ARG_PTR_TO_CTX:
case KF_ARG_PTR_TO_DYNPTR:
case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
@@ -13543,6 +12207,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_IRQ_FLAG:
case KF_ARG_PTR_TO_RES_SPIN_LOCK:
break;
+ case KF_ARG_PTR_TO_CTX:
+ arg_type = ARG_PTR_TO_CTX;
+ break;
default:
verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type);
return -EFAULT;
@@ -13571,13 +12238,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
break;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
- if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) {
- verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i);
+ if (!is_bpf_obj_drop_kfunc(meta->func_id)) {
+ verbose(env, "arg#%d expected for bpf_obj_drop()\n", i);
return -EINVAL;
}
} else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
- if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
- verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i);
+ if (!is_bpf_percpu_obj_drop_kfunc(meta->func_id)) {
+ verbose(env, "arg#%d expected for bpf_percpu_obj_drop()\n", i);
return -EINVAL;
}
} else {
@@ -13703,7 +12370,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return ret;
break;
case KF_ARG_PTR_TO_RB_NODE:
- if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ if (is_bpf_rbtree_add_kfunc(meta->func_id)) {
if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "arg#%d expected pointer to allocated object\n", i);
return -EINVAL;
@@ -13766,7 +12433,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
struct bpf_reg_state *size_reg = &regs[regno + 1];
const struct btf_param *size_arg = &args[i + 1];
- if (!register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) {
+ if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) {
ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
if (ret < 0) {
verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
@@ -13899,10 +12566,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return 0;
}
-static int fetch_kfunc_arg_meta(struct bpf_verifier_env *env,
- s32 func_id,
- s16 offset,
- struct bpf_kfunc_call_arg_meta *meta)
+int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env,
+ s32 func_id,
+ s16 offset,
+ struct bpf_kfunc_call_arg_meta *meta)
{
struct bpf_kfunc_meta kfunc;
int err;
@@ -13925,6 +12592,194 @@ static int fetch_kfunc_arg_meta(struct bpf_verifier_env *env,
return 0;
}
+/*
+ * Determine how many bytes a helper accesses through a stack pointer at
+ * argument position @arg (0-based, corresponding to R1-R5).
+ *
+ * Returns:
+ * > 0 known read access size in bytes
+ * 0 doesn't read anything directly
+ * S64_MIN unknown
+ * < 0 known write access of (-return) bytes
+ */
+s64 bpf_helper_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int arg, int insn_idx)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+ const struct bpf_func_proto *fn;
+ enum bpf_arg_type at;
+ s64 size;
+
+ if (bpf_get_helper_proto(env, insn->imm, &fn) < 0)
+ return S64_MIN;
+
+ at = fn->arg_type[arg];
+
+ switch (base_type(at)) {
+ case ARG_PTR_TO_MAP_KEY:
+ case ARG_PTR_TO_MAP_VALUE: {
+ bool is_key = base_type(at) == ARG_PTR_TO_MAP_KEY;
+ u64 val;
+ int i, map_reg;
+
+ for (i = 0; i < arg; i++) {
+ if (base_type(fn->arg_type[i]) == ARG_CONST_MAP_PTR)
+ break;
+ }
+ if (i >= arg)
+ goto scan_all_maps;
+
+ map_reg = BPF_REG_1 + i;
+
+ if (!(aux->const_reg_map_mask & BIT(map_reg)))
+ goto scan_all_maps;
+
+ i = aux->const_reg_vals[map_reg];
+ if (i < env->used_map_cnt) {
+ size = is_key ? env->used_maps[i]->key_size
+ : env->used_maps[i]->value_size;
+ goto out;
+ }
+scan_all_maps:
+ /*
+ * Map pointer is not known at this call site (e.g. different
+ * maps on merged paths). Conservatively return the largest
+ * key_size or value_size across all maps used by the program.
+ */
+ val = 0;
+ for (i = 0; i < env->used_map_cnt; i++) {
+ struct bpf_map *map = env->used_maps[i];
+ u32 sz = is_key ? map->key_size : map->value_size;
+
+ if (sz > val)
+ val = sz;
+ if (map->inner_map_meta) {
+ sz = is_key ? map->inner_map_meta->key_size
+ : map->inner_map_meta->value_size;
+ if (sz > val)
+ val = sz;
+ }
+ }
+ if (!val)
+ return S64_MIN;
+ size = val;
+ goto out;
+ }
+ case ARG_PTR_TO_MEM:
+ if (at & MEM_FIXED_SIZE) {
+ size = fn->arg_size[arg];
+ goto out;
+ }
+ if (arg + 1 < ARRAY_SIZE(fn->arg_type) &&
+ arg_type_is_mem_size(fn->arg_type[arg + 1])) {
+ int size_reg = BPF_REG_1 + arg + 1;
+
+ if (aux->const_reg_mask & BIT(size_reg)) {
+ size = (s64)aux->const_reg_vals[size_reg];
+ goto out;
+ }
+ /*
+ * Size arg is const on each path but differs across merged
+ * paths. MAX_BPF_STACK is a safe upper bound for reads.
+ */
+ if (at & MEM_UNINIT)
+ return 0;
+ return MAX_BPF_STACK;
+ }
+ return S64_MIN;
+ case ARG_PTR_TO_DYNPTR:
+ size = BPF_DYNPTR_SIZE;
+ break;
+ case ARG_PTR_TO_STACK:
+ /*
+ * Only used by bpf_calls_callback() helpers. The helper itself
+ * doesn't access stack. The callback subprog does and it's
+ * analyzed separately.
+ */
+ return 0;
+ default:
+ return S64_MIN;
+ }
+out:
+ /*
+ * MEM_UNINIT args are write-only: the helper initializes the
+ * buffer without reading it.
+ */
+ if (at & MEM_UNINIT)
+ return -size;
+ return size;
+}
+
+/*
+ * Determine how many bytes a kfunc accesses through a stack pointer at
+ * argument position @arg (0-based, corresponding to R1-R5).
+ *
+ * Returns:
+ * > 0 known read access size in bytes
+ * 0 doesn't access memory through that argument (ex: not a pointer)
+ * S64_MIN unknown
+ * < 0 known write access of (-return) bytes
+ */
+s64 bpf_kfunc_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int arg, int insn_idx)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+ struct bpf_kfunc_call_arg_meta meta;
+ const struct btf_param *args;
+ const struct btf_type *t, *ref_t;
+ const struct btf *btf;
+ u32 nargs, type_size;
+ s64 size;
+
+ if (bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta) < 0)
+ return S64_MIN;
+
+ btf = meta.btf;
+ args = btf_params(meta.func_proto);
+ nargs = btf_type_vlen(meta.func_proto);
+ if (arg >= nargs)
+ return 0;
+
+ t = btf_type_skip_modifiers(btf, args[arg].type, NULL);
+ if (!btf_type_is_ptr(t))
+ return 0;
+
+ /* dynptr: fixed 16-byte on-stack representation */
+ if (is_kfunc_arg_dynptr(btf, &args[arg])) {
+ size = BPF_DYNPTR_SIZE;
+ goto out;
+ }
+
+ /* ptr + __sz/__szk pair: size is in the next register */
+ if (arg + 1 < nargs &&
+ (btf_param_match_suffix(btf, &args[arg + 1], "__sz") ||
+ btf_param_match_suffix(btf, &args[arg + 1], "__szk"))) {
+ int size_reg = BPF_REG_1 + arg + 1;
+
+ if (aux->const_reg_mask & BIT(size_reg)) {
+ size = (s64)aux->const_reg_vals[size_reg];
+ goto out;
+ }
+ return MAX_BPF_STACK;
+ }
+
+ /* fixed-size pointed-to type: resolve via BTF */
+ ref_t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (!IS_ERR(btf_resolve_size(btf, ref_t, &type_size))) {
+ size = type_size;
+ goto out;
+ }
+
+ return S64_MIN;
+out:
+ /* KF_ITER_NEW kfuncs initialize the iterator state at arg 0 */
+ if (arg == 0 && meta.kfunc_flags & KF_ITER_NEW)
+ return -size;
+ if (is_kfunc_arg_uninit(btf, &args[arg]))
+ return -size;
+ return size;
+}
+
/* check special kfuncs and return:
* 1 - not fall-through to 'else' branch, continue verification
* 0 - fall-through to 'else' branch
@@ -13940,13 +12795,12 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
if (meta->btf != btf_vmlinux)
return 0;
- if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
- meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (is_bpf_obj_new_kfunc(meta->func_id) || is_bpf_percpu_obj_new_kfunc(meta->func_id)) {
struct btf_struct_meta *struct_meta;
struct btf *ret_btf;
u32 ret_btf_id;
- if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
+ if (is_bpf_obj_new_kfunc(meta->func_id) && !bpf_global_ma_set)
return -ENOMEM;
if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) {
@@ -13969,7 +12823,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
return -EINVAL;
}
- if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) {
if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
@@ -13999,7 +12853,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
}
struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
- if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) {
if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
return -EINVAL;
@@ -14015,12 +12869,12 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
- if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+ if (is_bpf_percpu_obj_new_kfunc(meta->func_id))
regs[BPF_REG_0].type |= MEM_PERCPU;
insn_aux->obj_new_size = ret_t->size;
insn_aux->kptr_struct_meta = struct_meta;
- } else if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ } else if (is_bpf_refcount_acquire_kfunc(meta->func_id)) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
regs[BPF_REG_0].btf = meta->arg_btf;
@@ -14106,6 +12960,8 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
}
static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
+static int process_bpf_exit_full(struct bpf_verifier_env *env,
+ bool *do_print_state, bool exception_exit);
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
@@ -14125,7 +12981,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (!insn->imm)
return 0;
- err = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
+ err = bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
if (err == -EACCES && meta.func_name)
verbose(env, "calling kernel function %s is not allowed\n", meta.func_name);
if (err)
@@ -14134,7 +12990,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_name = meta.func_name;
insn_aux = &env->insn_aux_data[insn_idx];
- insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+ insn_aux->is_iter_next = bpf_is_iter_next_kfunc(&meta);
if (!insn->off &&
(insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] ||
@@ -14152,7 +13008,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* Clear r0-r5 registers in forked state */
for (i = 0; i < CALLER_SAVED_REGS; i++)
- mark_reg_not_init(env, regs, caller_saved[i]);
+ bpf_mark_reg_not_init(env, &regs[caller_saved[i]]);
mark_reg_unknown(env, regs, BPF_REG_0);
err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1);
@@ -14171,7 +13027,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EACCES;
}
- sleepable = is_kfunc_sleepable(&meta);
+ sleepable = bpf_is_kfunc_sleepable(&meta);
if (sleepable && !in_sleepable(env)) {
verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
return -EACCES;
@@ -14186,7 +13042,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (err < 0)
return err;
- if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ if (is_bpf_rbtree_add_kfunc(meta.func_id)) {
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_rbtree_add_callback_state);
if (err) {
@@ -14246,34 +13102,24 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}));
}
- } else if (sleepable && env->cur_state->active_rcu_locks) {
- verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
- return -EACCES;
- }
-
- if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
- verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
- return -EACCES;
- }
-
- if (env->cur_state->active_preempt_locks) {
- if (preempt_disable) {
- env->cur_state->active_preempt_locks++;
- } else if (preempt_enable) {
- env->cur_state->active_preempt_locks--;
- } else if (sleepable) {
- verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name);
- return -EACCES;
- }
} else if (preempt_disable) {
env->cur_state->active_preempt_locks++;
} else if (preempt_enable) {
- verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
- return -EINVAL;
+ if (env->cur_state->active_preempt_locks == 0) {
+ verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
+ env->cur_state->active_preempt_locks--;
}
- if (env->cur_state->active_irq_id && sleepable) {
- verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name);
+ if (sleepable && !in_sleepable_context(env)) {
+ verbose(env, "kernel func %s is sleepable within %s\n",
+ func_name, non_sleepable_context_description(env));
+ return -EACCES;
+ }
+
+ if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+ verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
return -EACCES;
}
@@ -14300,11 +13146,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return err;
}
- if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
- meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
- meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ if (is_bpf_list_push_kfunc(meta.func_id) || is_bpf_rbtree_add_kfunc(meta.func_id)) {
release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
- insn_aux->insert_off = regs[BPF_REG_2].off;
+ insn_aux->insert_off = regs[BPF_REG_2].var_off.value;
insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
err = ref_convert_owning_non_owning(env, release_ref_obj_id);
if (err) {
@@ -14342,7 +13186,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
for (i = 0; i < CALLER_SAVED_REGS; i++) {
u32 regno = caller_saved[i];
- mark_reg_not_init(env, regs, regno);
+ bpf_mark_reg_not_init(env, &regs[regno]);
regs[regno].subreg_def = DEF_NOT_SUBREG;
}
@@ -14350,11 +13194,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
- /* Only exception is bpf_obj_new_impl */
if (meta.btf != btf_vmlinux ||
- (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
- meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
- meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
+ (!is_bpf_obj_new_kfunc(meta.func_id) &&
+ !is_bpf_percpu_obj_new_kfunc(meta.func_id) &&
+ !is_bpf_refcount_acquire_kfunc(meta.func_id))) {
verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
return -EINVAL;
}
@@ -14414,7 +13257,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
type |= PTR_UNTRUSTED;
else if (is_kfunc_rcu_protected(&meta) ||
- (is_iter_next_kfunc(&meta) &&
+ (bpf_is_iter_next_kfunc(&meta) &&
(get_iter_from_state(env->cur_state, &meta)
->type & MEM_RCU))) {
/*
@@ -14465,8 +13308,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].id = ++env->id_gen;
} else if (btf_type_is_void(t)) {
if (meta.btf == btf_vmlinux) {
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
- meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
+ if (is_bpf_obj_drop_kfunc(meta.func_id) ||
+ is_bpf_percpu_obj_drop_kfunc(meta.func_id)) {
insn_aux->kptr_struct_meta =
btf_find_struct_meta(meta.arg_btf,
meta.arg_btf_id);
@@ -14474,7 +13317,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
- if (is_kfunc_pkt_changing(&meta))
+ if (bpf_is_kfunc_pkt_changing(&meta))
clear_all_pkt_pointers(env);
nargs = btf_type_vlen(meta.func_proto);
@@ -14486,11 +13329,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (btf_type_is_ptr(t))
mark_btf_func_reg_size(env, regno, sizeof(void *));
else
- /* scalar. ensured by btf_check_kfunc_arg_match() */
+ /* scalar. ensured by check_kfunc_args() */
mark_btf_func_reg_size(env, regno, t->size);
}
- if (is_iter_next_kfunc(&meta)) {
+ if (bpf_is_iter_next_kfunc(&meta)) {
err = process_iter_next_call(env, insn_idx, &meta);
if (err)
return err;
@@ -14499,12 +13342,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie])
env->prog->call_session_cookie = true;
+ if (is_bpf_throw_kfunc(insn))
+ return process_bpf_exit_full(env, NULL, true);
+
return 0;
}
-static bool check_reg_sane_offset(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg,
- enum bpf_reg_type type)
+static bool check_reg_sane_offset_scalar(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ enum bpf_reg_type type)
{
bool known = tnum_is_const(reg->var_off);
s64 val = reg->var_off.value;
@@ -14516,12 +13362,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
return false;
}
- if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
- verbose(env, "%s pointer offset %d is not allowed\n",
- reg_type_str(env, type), reg->off);
- return false;
- }
-
if (smin == S64_MIN) {
verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
reg_type_str(env, type));
@@ -14537,6 +13377,29 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
return true;
}
+static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ enum bpf_reg_type type)
+{
+ bool known = tnum_is_const(reg->var_off);
+ s64 val = reg->var_off.value;
+ s64 smin = reg->smin_value;
+
+ if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
+ verbose(env, "%s pointer offset %lld is not allowed\n",
+ reg_type_str(env, type), val);
+ return false;
+ }
+
+ if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "%s pointer offset %lld is not allowed\n",
+ reg_type_str(env, type), smin);
+ return false;
+ }
+
+ return true;
+}
+
enum {
REASON_BOUNDS = -1,
REASON_TYPE = -2,
@@ -14558,13 +13421,11 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
* currently prohibited for unprivileged.
*/
max = MAX_BPF_STACK + mask_to_left;
- ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off);
+ ptr_limit = -ptr_reg->var_off.value;
break;
case PTR_TO_MAP_VALUE:
max = ptr_reg->map_ptr->value_size;
- ptr_limit = (mask_to_left ?
- ptr_reg->smin_value :
- ptr_reg->umax_value) + ptr_reg->off;
+ ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value;
break;
default:
return REASON_TYPE;
@@ -14795,9 +13656,6 @@ static int sanitize_err(struct bpf_verifier_env *env,
* Variable offset is prohibited for unprivileged mode for simplicity since it
* requires corresponding support in Spectre masking for stack ALU. See also
* retrieve_ptr_limit().
- *
- *
- * 'off' includes 'reg->off'.
*/
static int check_stack_access_for_ptr_arithmetic(
struct bpf_verifier_env *env,
@@ -14838,11 +13696,11 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
switch (dst_reg->type) {
case PTR_TO_STACK:
if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg,
- dst_reg->off + dst_reg->var_off.value))
+ dst_reg->var_off.value))
return -EACCES;
break;
case PTR_TO_MAP_VALUE:
- if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) {
+ if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) {
verbose(env, "R%d pointer arithmetic of map value goes out of range, "
"prohibited for !root\n", dst);
return -EACCES;
@@ -14950,8 +13808,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->type = ptr_reg->type;
dst_reg->id = ptr_reg->id;
- if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
- !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
+ if (!check_reg_sane_offset_scalar(env, off_reg, ptr_reg->type) ||
+ !check_reg_sane_offset_ptr(env, ptr_reg, ptr_reg->type))
return -EINVAL;
/* pointer types do not carry 32-bit bounds at the moment. */
@@ -14966,23 +13824,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
switch (opcode) {
case BPF_ADD:
- /* We can take a fixed offset as long as it doesn't overflow
- * the s32 'off' field
- */
- if (known && (ptr_reg->off + smin_val ==
- (s64)(s32)(ptr_reg->off + smin_val))) {
- /* pointer += K. Accumulate it into fixed offset */
- dst_reg->smin_value = smin_ptr;
- dst_reg->smax_value = smax_ptr;
- dst_reg->umin_value = umin_ptr;
- dst_reg->umax_value = umax_ptr;
- dst_reg->var_off = ptr_reg->var_off;
- dst_reg->off = ptr_reg->off + smin_val;
- dst_reg->raw = ptr_reg->raw;
- break;
- }
- /* A new variable offset is created. Note that off_reg->off
- * == 0, since it's a scalar.
+ /*
* dst_reg gets the pointer type and since some positive
* integer value was added to the pointer, give it a new 'id'
* if it's a PTR_TO_PACKET.
@@ -15001,12 +13843,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->umax_value = U64_MAX;
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
- dst_reg->off = ptr_reg->off;
dst_reg->raw = ptr_reg->raw;
if (reg_is_pkt_pointer(ptr_reg)) {
- dst_reg->id = ++env->id_gen;
- /* something was added to pkt_ptr, set range to zero */
- memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
+ if (!known)
+ dst_reg->id = ++env->id_gen;
+ /*
+ * Clear range for unknown addends since we can't know
+ * where the pkt pointer ended up. Also clear AT_PKT_END /
+ * BEYOND_PKT_END from prior comparison as any pointer
+ * arithmetic invalidates them.
+ */
+ if (!known || dst_reg->range < 0)
+ memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
}
break;
case BPF_SUB:
@@ -15025,19 +13873,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst);
return -EACCES;
}
- if (known && (ptr_reg->off - smin_val ==
- (s64)(s32)(ptr_reg->off - smin_val))) {
- /* pointer -= K. Subtract it from fixed offset */
- dst_reg->smin_value = smin_ptr;
- dst_reg->smax_value = smax_ptr;
- dst_reg->umin_value = umin_ptr;
- dst_reg->umax_value = umax_ptr;
- dst_reg->var_off = ptr_reg->var_off;
- dst_reg->id = ptr_reg->id;
- dst_reg->off = ptr_reg->off - smin_val;
- dst_reg->raw = ptr_reg->raw;
- break;
- }
/* A new variable offset is created. If the subtrahend is known
* nonnegative, then any reg->range we had before is still good.
*/
@@ -15057,12 +13892,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->umax_value = umax_ptr - umin_val;
}
dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
- dst_reg->off = ptr_reg->off;
dst_reg->raw = ptr_reg->raw;
if (reg_is_pkt_pointer(ptr_reg)) {
- dst_reg->id = ++env->id_gen;
- /* something was added to pkt_ptr, set range to zero */
- if (smin_val < 0)
+ if (!known)
+ dst_reg->id = ++env->id_gen;
+ /*
+ * Clear range if the subtrahend may be negative since
+ * pkt pointer could move past its bounds. A positive
+ * subtrahend moves it backwards keeping positive range
+ * intact. Also clear AT_PKT_END / BEYOND_PKT_END from
+ * prior comparison as arithmetic invalidates them.
+ */
+ if ((!known && smin_val < 0) || dst_reg->range < 0)
memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
}
break;
@@ -15080,7 +13921,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
return -EACCES;
}
- if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
+ if (!check_reg_sane_offset_ptr(env, dst_reg, ptr_reg->type))
return -EINVAL;
reg_bounds_sync(dst_reg);
bounds_ret = sanitize_check_bounds(env, insn, dst_reg);
@@ -15937,7 +14778,7 @@ static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *ins
* any existing ties and avoid incorrect bounds propagation.
*/
if (need_bswap || insn->imm == 16 || insn->imm == 32)
- dst_reg->id = 0;
+ clear_scalar_id(dst_reg);
if (need_bswap) {
if (insn->imm == 16)
@@ -16210,11 +15051,20 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
int err;
dst_reg = &regs[insn->dst_reg];
- src_reg = NULL;
+ if (BPF_SRC(insn->code) == BPF_X)
+ src_reg = &regs[insn->src_reg];
+ else
+ src_reg = NULL;
- if (dst_reg->type == PTR_TO_ARENA) {
+ /* Case where at least one operand is an arena. */
+ if (dst_reg->type == PTR_TO_ARENA || (src_reg && src_reg->type == PTR_TO_ARENA)) {
struct bpf_insn_aux_data *aux = cur_aux(env);
+ if (dst_reg->type != PTR_TO_ARENA)
+ *dst_reg = *src_reg;
+
+ dst_reg->subreg_def = env->insn_idx + 1;
+
if (BPF_CLASS(insn->code) == BPF_ALU64)
/*
* 32-bit operations zero upper bits automatically.
@@ -16230,7 +15080,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
ptr_reg = dst_reg;
if (BPF_SRC(insn->code) == BPF_X) {
- src_reg = &regs[insn->src_reg];
if (src_reg->type != SCALAR_VALUE) {
if (dst_reg->type != SCALAR_VALUE) {
/* Combining two pointers by any ALU op yields
@@ -16313,7 +15162,8 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
*/
if (env->bpf_capable &&
(BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) &&
- dst_reg->id && is_reg_const(src_reg, alu32)) {
+ dst_reg->id && is_reg_const(src_reg, alu32) &&
+ !(BPF_SRC(insn->code) == BPF_X && insn->src_reg == insn->dst_reg)) {
u64 val = reg_const_value(src_reg, alu32);
s32 off;
@@ -16338,21 +15188,20 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* we cannot accumulate another val into rx->off.
*/
clear_id:
- dst_reg->off = 0;
- dst_reg->id = 0;
+ clear_scalar_id(dst_reg);
} else {
if (alu32)
dst_reg->id |= BPF_ADD_CONST32;
else
dst_reg->id |= BPF_ADD_CONST64;
- dst_reg->off = off;
+ dst_reg->delta = off;
}
} else {
/*
* Make sure ID is cleared otherwise dst_reg min/max could be
* incorrectly propagated into other registers by sync_linked_regs()
*/
- dst_reg->id = 0;
+ clear_scalar_id(dst_reg);
}
return 0;
}
@@ -16365,23 +15214,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
int err;
if (opcode == BPF_END || opcode == BPF_NEG) {
- if (opcode == BPF_NEG) {
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->src_reg != BPF_REG_0 ||
- insn->off != 0 || insn->imm != 0) {
- verbose(env, "BPF_NEG uses reserved fields\n");
- return -EINVAL;
- }
- } else {
- if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
- (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
- (BPF_CLASS(insn->code) == BPF_ALU64 &&
- BPF_SRC(insn->code) != BPF_TO_LE)) {
- verbose(env, "BPF_END uses reserved fields\n");
- return -EINVAL;
- }
- }
-
/* check src operand */
err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
@@ -16394,8 +15226,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- if ((opcode == BPF_NEG || opcode == BPF_END) &&
- regs[insn->dst_reg].type == SCALAR_VALUE) {
+ if (regs[insn->dst_reg].type == SCALAR_VALUE) {
err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
err = err ?: adjust_scalar_min_max_vals(env, insn,
&regs[insn->dst_reg],
@@ -16409,38 +15240,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- if (BPF_CLASS(insn->code) == BPF_ALU) {
- if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
- insn->imm) {
- verbose(env, "BPF_MOV uses reserved fields\n");
- return -EINVAL;
- }
- } else if (insn->off == BPF_ADDR_SPACE_CAST) {
- if (insn->imm != 1 && insn->imm != 1u << 16) {
- verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
- return -EINVAL;
- }
+ if (insn->off == BPF_ADDR_SPACE_CAST) {
if (!env->prog->aux->arena) {
verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n");
return -EINVAL;
}
- } else {
- if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
- insn->off != 32) || insn->imm) {
- verbose(env, "BPF_MOV uses reserved fields\n");
- return -EINVAL;
- }
}
/* check src operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
- } else {
- if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose(env, "BPF_MOV uses reserved fields\n");
- return -EINVAL;
- }
}
/* check dest operand, mark as required later */
@@ -16483,7 +15293,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
if (!no_sext)
- dst_reg->id = 0;
+ clear_scalar_id(dst_reg);
coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
@@ -16509,7 +15319,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* propagated into src_reg by sync_linked_regs()
*/
if (!is_src_reg_u32)
- dst_reg->id = 0;
+ clear_scalar_id(dst_reg);
dst_reg->subreg_def = env->insn_idx + 1;
} else {
/* case: W1 = (s8, s16)W2 */
@@ -16519,7 +15329,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
if (!no_sext)
- dst_reg->id = 0;
+ clear_scalar_id(dst_reg);
dst_reg->subreg_def = env->insn_idx + 1;
coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
}
@@ -16546,28 +15356,13 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
}
- } else if (opcode > BPF_END) {
- verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
- return -EINVAL;
-
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) ||
- (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
- verbose(env, "BPF_ALU uses reserved fields\n");
- return -EINVAL;
- }
/* check src1 operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
- } else {
- if (insn->src_reg != BPF_REG_0 || (insn->off != 0 && insn->off != 1) ||
- (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
- verbose(env, "BPF_ALU uses reserved fields\n");
- return -EINVAL;
- }
}
/* check src2 operand */
@@ -16610,19 +15405,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
struct bpf_reg_state *reg;
int new_range;
- if (dst_reg->off < 0 ||
- (dst_reg->off == 0 && range_right_open))
+ if (dst_reg->umax_value == 0 && range_right_open)
/* This doesn't give us any range */
return;
- if (dst_reg->umax_value > MAX_PACKET_OFF ||
- dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
+ if (dst_reg->umax_value > MAX_PACKET_OFF)
/* Risk of overflow. For instance, ptr + (1<<63) may be less
* than pkt_end, but that's because it's also less than pkt.
*/
return;
- new_range = dst_reg->off;
+ new_range = dst_reg->umax_value;
if (range_right_open)
new_range++;
@@ -16671,7 +15464,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
/* If our ids match, then we must have the same max_value. And we
* don't care about the other reg's fixed offset, since if it's too big
* the range won't allow anything.
- * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
+ * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
if (reg->type == type && reg->id == dst_reg->id)
@@ -16680,11 +15473,50 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
}));
}
+static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32);
+static u8 rev_opcode(u8 opcode);
+
+/*
+ * Learn more information about live branches by simulating refinement on both branches.
+ * regs_refine_cond_op() is sound, so producing ill-formed register bounds for the branch means
+ * that branch is dead.
+ */
+static int simulate_both_branches_taken(struct bpf_verifier_env *env, u8 opcode, bool is_jmp32)
+{
+ /* Fallthrough (FALSE) branch */
+ regs_refine_cond_op(&env->false_reg1, &env->false_reg2, rev_opcode(opcode), is_jmp32);
+ reg_bounds_sync(&env->false_reg1);
+ reg_bounds_sync(&env->false_reg2);
+ /*
+ * If there is a range bounds violation in *any* of the abstract values in either
+ * reg_states in the FALSE branch (i.e. reg1, reg2), the FALSE branch must be dead. Only
+ * TRUE branch will be taken.
+ */
+ if (range_bounds_violation(&env->false_reg1) || range_bounds_violation(&env->false_reg2))
+ return 1;
+
+ /* Jump (TRUE) branch */
+ regs_refine_cond_op(&env->true_reg1, &env->true_reg2, opcode, is_jmp32);
+ reg_bounds_sync(&env->true_reg1);
+ reg_bounds_sync(&env->true_reg2);
+ /*
+ * If there is a range bounds violation in *any* of the abstract values in either
+ * reg_states in the TRUE branch (i.e. true_reg1, true_reg2), the TRUE branch must be dead.
+ * Only FALSE branch will be taken.
+ */
+ if (range_bounds_violation(&env->true_reg1) || range_bounds_violation(&env->true_reg2))
+ return 0;
+
+ /* Both branches are possible, we can't determine which one will be taken. */
+ return -1;
+}
+
/*
* <reg1> <op> <reg2>, currently assuming reg2 is a constant
*/
-static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
- u8 opcode, bool is_jmp32)
+static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *reg1,
+ struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32)
{
struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off;
struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
@@ -16836,7 +15668,7 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta
break;
}
- return -1;
+ return simulate_both_branches_taken(env, opcode, is_jmp32);
}
static int flip_opcode(u32 opcode)
@@ -16907,8 +15739,8 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
* -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value
* range [0,10]
*/
-static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
- u8 opcode, bool is_jmp32)
+static int is_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *reg1,
+ struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32)
{
if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32)
return is_pkt_ptr_branch_taken(reg1, reg2, opcode);
@@ -16946,7 +15778,7 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg
}
/* now deal with two scalars, but not necessarily constants */
- return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
+ return is_scalar_branch_taken(env, reg1, reg2, opcode, is_jmp32);
}
/* Opcode that corresponds to a *false* branch condition.
@@ -17037,8 +15869,8 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
/* u32_min_value is not equal to 0xffffffff at this point,
* because otherwise u32_max_value is 0xffffffff as well,
* in such a case both reg1 and reg2 would be constants,
- * jump would be predicted and reg_set_min_max() won't
- * be called.
+ * jump would be predicted and regs_refine_cond_op()
+ * wouldn't be called.
*
* Same reasoning works for all {u,s}{min,max}{32,64} cases
* below.
@@ -17145,49 +15977,15 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
}
}
-/* Adjusts the register min/max values in the case that the dst_reg and
- * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
- * check, in which case we have a fake SCALAR_VALUE representing insn->imm).
- * Technically we can do similar adjustments for pointers to the same object,
- * but we don't support that right now.
- */
-static int reg_set_min_max(struct bpf_verifier_env *env,
- struct bpf_reg_state *true_reg1,
- struct bpf_reg_state *true_reg2,
- struct bpf_reg_state *false_reg1,
- struct bpf_reg_state *false_reg2,
- u8 opcode, bool is_jmp32)
+/* Check for invariant violations on the registers for both branches of a condition */
+static int regs_bounds_sanity_check_branches(struct bpf_verifier_env *env)
{
int err;
- /* If either register is a pointer, we can't learn anything about its
- * variable offset from the compare (unless they were a pointer into
- * the same object, but we don't bother with that).
- */
- if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
- return 0;
-
- /* We compute branch direction for same SCALAR_VALUE registers in
- * is_scalar_branch_taken(). For unknown branch directions (e.g., BPF_JSET)
- * on the same registers, we don't need to adjust the min/max values.
- */
- if (false_reg1 == false_reg2)
- return 0;
-
- /* fallthrough (FALSE) branch */
- regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
- reg_bounds_sync(false_reg1);
- reg_bounds_sync(false_reg2);
-
- /* jump (TRUE) branch */
- regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
- reg_bounds_sync(true_reg1);
- reg_bounds_sync(true_reg2);
-
- err = reg_bounds_sanity_check(env, true_reg1, "true_reg1");
- err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2");
- err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1");
- err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2");
+ err = reg_bounds_sanity_check(env, &env->true_reg1, "true_reg1");
+ err = err ?: reg_bounds_sanity_check(env, &env->true_reg2, "true_reg2");
+ err = err ?: reg_bounds_sanity_check(env, &env->false_reg1, "false_reg1");
+ err = err ?: reg_bounds_sanity_check(env, &env->false_reg2, "false_reg2");
return err;
}
@@ -17197,29 +15995,24 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
{
if (type_may_be_null(reg->type) && reg->id == id &&
(is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
- /* Old offset (both fixed and variable parts) should have been
- * known-zero, because we don't allow pointer arithmetic on
- * pointers that might be NULL. If we see this happening, don't
- * convert the register.
+ /* Old offset should have been known-zero, because we don't
+ * allow pointer arithmetic on pointers that might be NULL.
+ * If we see this happening, don't convert the register.
*
* But in some cases, some helpers that return local kptrs
- * advance offset for the returned pointer. In those cases, it
- * is fine to expect to see reg->off.
+ * advance offset for the returned pointer. In those cases,
+ * it is fine to expect to see reg->var_off.
*/
- if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
- return;
if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) &&
- WARN_ON_ONCE(reg->off))
+ WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0)))
return;
-
if (is_null) {
- reg->type = SCALAR_VALUE;
/* We don't need id and ref_obj_id from this point
* onwards anymore, thus we should better reset it,
* so that state pruning has chances to take effect.
*/
- reg->id = 0;
- reg->ref_obj_id = 0;
+ __mark_reg_known_zero(reg);
+ reg->type = SCALAR_VALUE;
return;
}
@@ -17380,7 +16173,7 @@ static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_st
e->is_reg = is_reg;
e->regno = spi_or_reg;
} else {
- reg->id = 0;
+ clear_scalar_id(reg);
}
}
@@ -17401,7 +16194,7 @@ static void collect_linked_regs(struct bpf_verifier_env *env,
id = id & ~BPF_ADD_CONST;
for (i = vstate->curframe; i >= 0; i--) {
- live_regs = aux[frame_insn_idx(vstate, i)].live_regs_before;
+ live_regs = aux[bpf_frame_insn_idx(vstate, i)].live_regs_before;
func = vstate->frame[i];
for (j = 0; j < BPF_REG_FP; j++) {
if (!(live_regs & BIT(j)))
@@ -17410,7 +16203,7 @@ static void collect_linked_regs(struct bpf_verifier_env *env,
__collect_linked_regs(linked_regs, reg, id, i, j, true);
}
for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
- if (!is_spilled_reg(&func->stack[j]))
+ if (!bpf_is_spilled_reg(&func->stack[j]))
continue;
reg = &func->stack[j].spilled_ptr;
__collect_linked_regs(linked_regs, reg, id, i, j, false);
@@ -17444,18 +16237,18 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
if (((reg->id ^ known_reg->id) & BPF_ADD_CONST) == BPF_ADD_CONST)
continue;
if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) ||
- reg->off == known_reg->off) {
+ reg->delta == known_reg->delta) {
s32 saved_subreg_def = reg->subreg_def;
copy_register_state(reg, known_reg);
reg->subreg_def = saved_subreg_def;
} else {
s32 saved_subreg_def = reg->subreg_def;
- s32 saved_off = reg->off;
+ s32 saved_off = reg->delta;
u32 saved_id = reg->id;
fake_reg.type = SCALAR_VALUE;
- __mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off);
+ __mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta);
/* reg = known_reg; reg += delta */
copy_register_state(reg, known_reg);
@@ -17463,7 +16256,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
* Must preserve off, id and subreg_def flag,
* otherwise another sync_linked_regs() will be incorrect.
*/
- reg->off = saved_off;
+ reg->delta = saved_off;
reg->id = saved_id;
reg->subreg_def = saved_subreg_def;
@@ -17506,12 +16299,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
int idx = *insn_idx;
- if (insn->code != (BPF_JMP | BPF_JCOND) ||
- insn->src_reg != BPF_MAY_GOTO ||
- insn->dst_reg || insn->imm) {
- verbose(env, "invalid may_goto imm %d\n", insn->imm);
- return -EINVAL;
- }
prev_st = find_prev_entry(env, cur_st->parent, idx);
/* branch out 'fallthrough' insn as a new state to explore */
@@ -17533,11 +16320,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
dst_reg = &regs[insn->dst_reg];
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0) {
- verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
- return -EINVAL;
- }
-
/* check src1 operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
@@ -17556,10 +16338,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (dst_reg->type == PTR_TO_STACK)
insn_flags |= INSN_F_DST_REG_STACK;
} else {
- if (insn->src_reg != BPF_REG_0) {
- verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
- return -EINVAL;
- }
src_reg = &env->fake_reg[0];
memset(src_reg, 0, sizeof(*src_reg));
src_reg->type = SCALAR_VALUE;
@@ -17570,13 +16348,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
if (insn_flags) {
- err = push_jmp_history(env, this_branch, insn_flags, 0);
+ err = bpf_push_jmp_history(env, this_branch, insn_flags, 0);
if (err)
return err;
}
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
- pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
+ copy_register_state(&env->false_reg1, dst_reg);
+ copy_register_state(&env->false_reg2, src_reg);
+ copy_register_state(&env->true_reg1, dst_reg);
+ copy_register_state(&env->true_reg2, src_reg);
+ pred = is_branch_taken(env, dst_reg, src_reg, opcode, is_jmp32);
if (pred >= 0) {
/* If we get here with a dst_reg pointer type it is because
* above is_branch_taken() special cased the 0 comparison.
@@ -17630,7 +16412,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
collect_linked_regs(env, this_branch, dst_reg->id, &linked_regs);
if (linked_regs.cnt > 1) {
- err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+ err = bpf_push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
if (err)
return err;
}
@@ -17640,27 +16422,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return PTR_ERR(other_branch);
other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
- if (BPF_SRC(insn->code) == BPF_X) {
- err = reg_set_min_max(env,
- &other_branch_regs[insn->dst_reg],
- &other_branch_regs[insn->src_reg],
- dst_reg, src_reg, opcode, is_jmp32);
- } else /* BPF_SRC(insn->code) == BPF_K */ {
- /* reg_set_min_max() can mangle the fake_reg. Make a copy
- * so that these are two different memory locations. The
- * src_reg is not used beyond here in context of K.
- */
- memcpy(&env->fake_reg[1], &env->fake_reg[0],
- sizeof(env->fake_reg[0]));
- err = reg_set_min_max(env,
- &other_branch_regs[insn->dst_reg],
- &env->fake_reg[0],
- dst_reg, &env->fake_reg[1],
- opcode, is_jmp32);
- }
+ err = regs_bounds_sanity_check_branches(env);
if (err)
return err;
+ copy_register_state(dst_reg, &env->false_reg1);
+ copy_register_state(src_reg, &env->false_reg2);
+ copy_register_state(&other_branch_regs[insn->dst_reg], &env->true_reg1);
+ if (BPF_SRC(insn->code) == BPF_X)
+ copy_register_state(&other_branch_regs[insn->src_reg], &env->true_reg2);
+
if (BPF_SRC(insn->code) == BPF_X &&
src_reg->type == SCALAR_VALUE && src_reg->id &&
!WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
@@ -17713,12 +16484,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
+ * Also does the same detection for a register whose the value is
+ * known to be 0.
* NOTE: these optimizations below are related with pointer comparison
* which will never be JMP32.
*/
- if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
- insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
- type_may_be_null(dst_reg->type)) {
+ if (!is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ type_may_be_null(dst_reg->type) &&
+ ((BPF_SRC(insn->code) == BPF_K && insn->imm == 0) ||
+ (BPF_SRC(insn->code) == BPF_X && bpf_register_is_null(src_reg)))) {
/* Mark all identical registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
@@ -17751,10 +16525,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
verbose(env, "invalid BPF_LD_IMM insn\n");
return -EINVAL;
}
- if (insn->off != 0) {
- verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
- return -EINVAL;
- }
err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
@@ -17794,8 +16564,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->src_reg == BPF_PSEUDO_FUNC) {
struct bpf_prog_aux *aux = env->prog->aux;
- u32 subprogno = find_subprog(env,
- env->insn_idx + insn->imm + 1);
+ u32 subprogno = bpf_find_subprog(env,
+ env->insn_idx + insn->imm + 1);
if (!aux->func_info) {
verbose(env, "missing btf func_info\n");
@@ -17812,22 +16582,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
map = env->used_maps[aux->map_index];
- dst_reg->map_ptr = map;
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
if (map->map_type == BPF_MAP_TYPE_ARENA) {
__mark_reg_unknown(env, dst_reg);
+ dst_reg->map_ptr = map;
return 0;
}
+ __mark_reg_known(dst_reg, aux->map_off);
dst_reg->type = PTR_TO_MAP_VALUE;
- dst_reg->off = aux->map_off;
+ dst_reg->map_ptr = map;
WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY &&
map->max_entries != 1);
/* We want reg->id to be same (0) as map_value is not distinct */
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
insn->src_reg == BPF_PSEUDO_MAP_IDX) {
dst_reg->type = CONST_PTR_TO_MAP;
+ dst_reg->map_ptr = map;
} else {
verifier_bug(env, "unexpected src reg value for ldimm64");
return -EFAULT;
@@ -17880,13 +16652,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EFAULT;
}
- if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
- BPF_SIZE(insn->code) == BPF_DW ||
- (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
- verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
- return -EINVAL;
- }
-
/* check whether implicit source operand (register R6) is readable */
err = check_reg_arg(env, ctx_reg, SRC_OP);
if (err)
@@ -17919,7 +16684,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(env, regs, caller_saved[i]);
+ bpf_mark_reg_not_init(env, &regs[caller_saved[i]]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
@@ -17930,107 +16695,59 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
mark_reg_unknown(env, regs, BPF_REG_0);
/* ld_abs load up to 32-bit skb data. */
regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
+ /*
+ * See bpf_gen_ld_abs() which emits a hidden BPF_EXIT with r0=0
+ * which must be explored by the verifier when in a subprog.
+ */
+ if (env->cur_state->curframe) {
+ struct bpf_verifier_state *branch;
+
+ mark_reg_scratched(env, BPF_REG_0);
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (IS_ERR(branch))
+ return PTR_ERR(branch);
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ err = prepare_func_exit(env, &env->insn_idx);
+ if (err)
+ return err;
+ env->insn_idx--;
+ }
return 0;
}
-static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
+
+static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_range *range)
{
- const char *exit_ctx = "At program exit";
- struct tnum enforce_attach_type_range = tnum_unknown;
- const struct bpf_prog *prog = env->prog;
- struct bpf_reg_state *reg = reg_state(env, regno);
- struct bpf_retval_range range = retval_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
- int err;
- struct bpf_func_state *frame = env->cur_state->frame[0];
- const bool is_subprog = frame->subprogno;
- bool return_32bit = false;
- const struct btf_type *reg_type, *ret_type = NULL;
- /* LSM and struct_ops func-ptr's return type could be "void" */
- if (!is_subprog || frame->in_exception_callback_fn) {
- switch (prog_type) {
- case BPF_PROG_TYPE_LSM:
- if (prog->expected_attach_type == BPF_LSM_CGROUP)
- /* See below, can be 0 or 0-1 depending on hook. */
- break;
- if (!prog->aux->attach_func_proto->type)
- return 0;
- break;
- case BPF_PROG_TYPE_STRUCT_OPS:
- if (!prog->aux->attach_func_proto->type)
- return 0;
-
- if (frame->in_exception_callback_fn)
- break;
+ /* Default return value range. */
+ *range = retval_range(0, 1);
- /* Allow a struct_ops program to return a referenced kptr if it
- * matches the operator's return type and is in its unmodified
- * form. A scalar zero (i.e., a null pointer) is also allowed.
- */
- reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL;
- ret_type = btf_type_resolve_ptr(prog->aux->attach_btf,
- prog->aux->attach_func_proto->type,
- NULL);
- if (ret_type && ret_type == reg_type && reg->ref_obj_id)
- return __check_ptr_off_reg(env, reg, regno, false);
+ switch (prog_type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ switch (env->prog->expected_attach_type) {
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ *range = retval_range(1, 1);
+ break;
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ *range = retval_range(0, 3);
break;
default:
break;
}
- }
-
- /* eBPF calling convention is such that R0 is used
- * to return the value from eBPF program.
- * Make sure that it's readable at this time
- * of bpf_exit, which means that program wrote
- * something into it earlier
- */
- err = check_reg_arg(env, regno, SRC_OP);
- if (err)
- return err;
-
- if (is_pointer_value(env, regno)) {
- verbose(env, "R%d leaks addr as return value\n", regno);
- return -EACCES;
- }
-
- if (frame->in_async_callback_fn) {
- exit_ctx = "At async callback return";
- range = frame->callback_ret_range;
- goto enforce_retval;
- }
-
- if (is_subprog && !frame->in_exception_callback_fn) {
- if (reg->type != SCALAR_VALUE) {
- verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n",
- regno, reg_type_str(env, reg->type));
- return -EINVAL;
- }
- return 0;
- }
-
- switch (prog_type) {
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
- env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
- env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
- env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
- range = retval_range(1, 1);
- if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
- env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
- range = retval_range(0, 3);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
- if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
- range = retval_range(0, 3);
- enforce_attach_type_range = tnum_range(2, 3);
- }
+ if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS)
+ *range = retval_range(0, 3);
break;
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_SOCK_OPS:
@@ -18040,72 +16757,164 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
break;
case BPF_PROG_TYPE_RAW_TRACEPOINT:
if (!env->prog->aux->attach_btf_id)
- return 0;
- range = retval_range(0, 0);
+ return false;
+ *range = retval_range(0, 0);
break;
case BPF_PROG_TYPE_TRACING:
switch (env->prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_TRACE_FSESSION:
- range = retval_range(0, 0);
+ *range = retval_range(0, 0);
break;
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
- return 0;
+ return false;
case BPF_TRACE_ITER:
- break;
default:
- return -ENOTSUPP;
+ break;
}
break;
case BPF_PROG_TYPE_KPROBE:
switch (env->prog->expected_attach_type) {
case BPF_TRACE_KPROBE_SESSION:
case BPF_TRACE_UPROBE_SESSION:
- range = retval_range(0, 1);
break;
default:
- return 0;
+ return false;
}
break;
case BPF_PROG_TYPE_SK_LOOKUP:
- range = retval_range(SK_DROP, SK_PASS);
+ *range = retval_range(SK_DROP, SK_PASS);
break;
case BPF_PROG_TYPE_LSM:
if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
/* no range found, any return value is allowed */
- if (!get_func_retval_range(env->prog, &range))
- return 0;
+ if (!get_func_retval_range(env->prog, range))
+ return false;
/* no restricted range, any return value is allowed */
- if (range.minval == S32_MIN && range.maxval == S32_MAX)
- return 0;
- return_32bit = true;
+ if (range->minval == S32_MIN && range->maxval == S32_MAX)
+ return false;
+ range->return_32bit = true;
} else if (!env->prog->aux->attach_func_proto->type) {
/* Make sure programs that attach to void
* hooks don't try to modify return value.
*/
- range = retval_range(1, 1);
+ *range = retval_range(1, 1);
}
break;
case BPF_PROG_TYPE_NETFILTER:
- range = retval_range(NF_DROP, NF_ACCEPT);
+ *range = retval_range(NF_DROP, NF_ACCEPT);
break;
case BPF_PROG_TYPE_STRUCT_OPS:
- if (!ret_type)
- return 0;
- range = retval_range(0, 0);
+ *range = retval_range(0, 0);
break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
*/
default:
+ return false;
+ }
+
+ /* Continue calculating. */
+
+ return true;
+}
+
+static bool program_returns_void(struct bpf_verifier_env *env)
+{
+ const struct bpf_prog *prog = env->prog;
+ enum bpf_prog_type prog_type = prog->type;
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ /* See return_retval_range, for BPF_LSM_CGROUP can be 0 or 0-1 depending on hook. */
+ if (prog->expected_attach_type != BPF_LSM_CGROUP &&
+ !prog->aux->attach_func_proto->type)
+ return true;
+ break;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!prog->aux->attach_func_proto->type)
+ return true;
+ break;
+ case BPF_PROG_TYPE_EXT:
+ /*
+ * If the actual program is an extension, let it
+ * return void - attaching will succeed only if the
+ * program being replaced also returns void, and since
+ * it has passed verification its actual type doesn't matter.
+ */
+ if (subprog_returns_void(env, 0))
+ return true;
+ break;
+ default:
+ break;
+ }
+ return false;
+}
+
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
+{
+ const char *exit_ctx = "At program exit";
+ struct tnum enforce_attach_type_range = tnum_unknown;
+ const struct bpf_prog *prog = env->prog;
+ struct bpf_reg_state *reg = reg_state(env, regno);
+ struct bpf_retval_range range = retval_range(0, 1);
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+ struct bpf_func_state *frame = env->cur_state->frame[0];
+ const struct btf_type *reg_type, *ret_type = NULL;
+ int err;
+
+ /* LSM and struct_ops func-ptr's return type could be "void" */
+ if (!frame->in_async_callback_fn && program_returns_void(env))
return 0;
+
+ if (prog_type == BPF_PROG_TYPE_STRUCT_OPS) {
+ /* Allow a struct_ops program to return a referenced kptr if it
+ * matches the operator's return type and is in its unmodified
+ * form. A scalar zero (i.e., a null pointer) is also allowed.
+ */
+ reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL;
+ ret_type = btf_type_resolve_ptr(prog->aux->attach_btf,
+ prog->aux->attach_func_proto->type,
+ NULL);
+ if (ret_type && ret_type == reg_type && reg->ref_obj_id)
+ return __check_ptr_off_reg(env, reg, regno, false);
+ }
+
+ /* eBPF calling convention is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ err = check_reg_arg(env, regno, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, regno)) {
+ verbose(env, "R%d leaks addr as return value\n", regno);
+ return -EACCES;
}
+ if (frame->in_async_callback_fn) {
+ exit_ctx = "At async callback return";
+ range = frame->callback_ret_range;
+ goto enforce_retval;
+ }
+
+ if (prog_type == BPF_PROG_TYPE_STRUCT_OPS && !ret_type)
+ return 0;
+
+ if (prog_type == BPF_PROG_TYPE_CGROUP_SKB && (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS))
+ enforce_attach_type_range = tnum_range(2, 3);
+
+ if (!return_retval_range(env, &range))
+ return 0;
+
enforce_retval:
if (reg->type != SCALAR_VALUE) {
verbose(env, "%s the register R%d is not a known value (%s)\n",
@@ -18117,10 +16926,9 @@ enforce_retval:
if (err)
return err;
- if (!retval_range_within(range, reg, return_32bit)) {
+ if (!retval_range_within(range, reg)) {
verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
- if (!is_subprog &&
- prog->expected_attach_type == BPF_LSM_CGROUP &&
+ if (prog->expected_attach_type == BPF_LSM_CGROUP &&
prog_type == BPF_PROG_TYPE_LSM &&
!prog->aux->attach_func_proto->type)
verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
@@ -18133,189 +16941,31 @@ enforce_retval:
return 0;
}
-static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
-{
- struct bpf_subprog_info *subprog;
-
- subprog = bpf_find_containing_subprog(env, off);
- subprog->changes_pkt_data = true;
-}
-
-static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off)
-{
- struct bpf_subprog_info *subprog;
-
- subprog = bpf_find_containing_subprog(env, off);
- subprog->might_sleep = true;
-}
-
-/* 't' is an index of a call-site.
- * 'w' is a callee entry point.
- * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
- * Rely on DFS traversal order and absence of recursive calls to guarantee that
- * callee's change_pkt_data marks would be correct at that moment.
- */
-static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
-{
- struct bpf_subprog_info *caller, *callee;
-
- caller = bpf_find_containing_subprog(env, t);
- callee = bpf_find_containing_subprog(env, w);
- caller->changes_pkt_data |= callee->changes_pkt_data;
- caller->might_sleep |= callee->might_sleep;
-}
-
-/* non-recursive DFS pseudo code
- * 1 procedure DFS-iterative(G,v):
- * 2 label v as discovered
- * 3 let S be a stack
- * 4 S.push(v)
- * 5 while S is not empty
- * 6 t <- S.peek()
- * 7 if t is what we're looking for:
- * 8 return t
- * 9 for all edges e in G.adjacentEdges(t) do
- * 10 if edge e is already labelled
- * 11 continue with the next edge
- * 12 w <- G.adjacentVertex(t,e)
- * 13 if vertex w is not discovered and not explored
- * 14 label e as tree-edge
- * 15 label w as discovered
- * 16 S.push(w)
- * 17 continue at 5
- * 18 else if vertex w is discovered
- * 19 label e as back-edge
- * 20 else
- * 21 // vertex w is explored
- * 22 label e as forward- or cross-edge
- * 23 label t as explored
- * 24 S.pop()
- *
- * convention:
- * 0x10 - discovered
- * 0x11 - discovered and fall-through edge labelled
- * 0x12 - discovered and fall-through and branch edges labelled
- * 0x20 - explored
- */
-
-enum {
- DISCOVERED = 0x10,
- EXPLORED = 0x20,
- FALLTHROUGH = 1,
- BRANCH = 2,
-};
-
-static void mark_prune_point(struct bpf_verifier_env *env, int idx)
-{
- env->insn_aux_data[idx].prune_point = true;
-}
-
-static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
-{
- return env->insn_aux_data[insn_idx].prune_point;
-}
-
-static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
-{
- env->insn_aux_data[idx].force_checkpoint = true;
-}
-
-static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
-{
- return env->insn_aux_data[insn_idx].force_checkpoint;
-}
-
-static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
-{
- env->insn_aux_data[idx].calls_callback = true;
-}
-
-bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx)
-{
- return env->insn_aux_data[insn_idx].calls_callback;
-}
-
-enum {
- DONE_EXPLORING = 0,
- KEEP_EXPLORING = 1,
-};
-
-/* t, w, e - match pseudo-code above:
- * t - index of current instruction
- * w - next instruction
- * e - edge
- */
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
+static int check_global_subprog_return_code(struct bpf_verifier_env *env)
{
- int *insn_stack = env->cfg.insn_stack;
- int *insn_state = env->cfg.insn_state;
-
- if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
- return DONE_EXPLORING;
+ struct bpf_reg_state *reg = reg_state(env, BPF_REG_0);
+ struct bpf_func_state *cur_frame = cur_func(env);
+ int err;
- if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
- return DONE_EXPLORING;
+ if (subprog_returns_void(env, cur_frame->subprogno))
+ return 0;
- if (w < 0 || w >= env->prog->len) {
- verbose_linfo(env, t, "%d: ", t);
- verbose(env, "jump out of range from insn %d to %d\n", t, w);
- return -EINVAL;
- }
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
- if (e == BRANCH) {
- /* mark branch target for state pruning */
- mark_prune_point(env, w);
- mark_jmp_point(env, w);
+ if (is_pointer_value(env, BPF_REG_0)) {
+ verbose(env, "R%d leaks addr as return value\n", BPF_REG_0);
+ return -EACCES;
}
- if (insn_state[w] == 0) {
- /* tree-edge */
- insn_state[t] = DISCOVERED | e;
- insn_state[w] = DISCOVERED;
- if (env->cfg.cur_stack >= env->prog->len)
- return -E2BIG;
- insn_stack[env->cfg.cur_stack++] = w;
- return KEEP_EXPLORING;
- } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- if (env->bpf_capable)
- return DONE_EXPLORING;
- verbose_linfo(env, t, "%d: ", t);
- verbose_linfo(env, w, "%d: ", w);
- verbose(env, "back-edge from insn %d to %d\n", t, w);
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
+ reg_type_str(env, reg->type));
return -EINVAL;
- } else if (insn_state[w] == EXPLORED) {
- /* forward- or cross-edge */
- insn_state[t] = DISCOVERED | e;
- } else {
- verifier_bug(env, "insn state internal bug");
- return -EFAULT;
}
- return DONE_EXPLORING;
-}
-static int visit_func_call_insn(int t, struct bpf_insn *insns,
- struct bpf_verifier_env *env,
- bool visit_callee)
-{
- int ret, insn_sz;
- int w;
-
- insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
- ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
- if (ret)
- return ret;
-
- mark_prune_point(env, t + insn_sz);
- /* when we exit from subprog, we need to record non-linear history */
- mark_jmp_point(env, t + insn_sz);
-
- if (visit_callee) {
- w = t + insns[t].imm + 1;
- mark_prune_point(env, t);
- merge_callee_effects(env, t, w);
- ret = push_insn(t, w, BRANCH, env);
- }
- return ret;
+ return 0;
}
/* Bitmask with 1s for all caller saved registers */
@@ -18325,7 +16975,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
* replacement patch is presumed to follow bpf_fastcall contract
* (see mark_fastcall_pattern_for_call() below).
*/
-static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
+bool bpf_verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
{
switch (imm) {
#ifdef CONFIG_X86_64
@@ -18341,17 +16991,11 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
}
}
-struct call_summary {
- u8 num_params;
- bool is_void;
- bool fastcall;
-};
-
/* If @call is a kfunc or helper call, fills @cs and returns true,
* otherwise returns false.
*/
-static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call,
- struct call_summary *cs)
+bool bpf_get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call,
+ struct bpf_call_summary *cs)
{
struct bpf_kfunc_call_arg_meta meta;
const struct bpf_func_proto *fn;
@@ -18359,11 +17003,11 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call
if (bpf_helper_call(call)) {
- if (get_helper_proto(env, call->imm, &fn) < 0)
+ if (bpf_get_helper_proto(env, call->imm, &fn) < 0)
/* error would be reported later */
return false;
cs->fastcall = fn->allow_fastcall &&
- (verifier_inlines_helper_call(env, call->imm) ||
+ (bpf_verifier_inlines_helper_call(env, call->imm) ||
bpf_jit_inlines_helper_call(call->imm));
cs->is_void = fn->ret_type == RET_VOID;
cs->num_params = 0;
@@ -18378,7 +17022,7 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call
if (bpf_pseudo_kfunc_call(call)) {
int err;
- err = fetch_kfunc_arg_meta(env, call->imm, call->off, &meta);
+ err = bpf_fetch_kfunc_arg_meta(env, call->imm, call->off, &meta);
if (err < 0)
/* error would be reported later */
return false;
@@ -18472,12 +17116,12 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env,
struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx;
struct bpf_insn *call = &env->prog->insnsi[insn_idx];
u32 clobbered_regs_mask;
- struct call_summary cs;
+ struct bpf_call_summary cs;
u32 expected_regs_mask;
s16 off;
int i;
- if (!get_call_summary(env, call, &cs))
+ if (!bpf_get_call_summary(env, call, &cs))
return;
/* A bitmask specifying which caller saved registers are clobbered
@@ -18580,714 +17224,6 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env)
return 0;
}
-static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem)
-{
- size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]);
- struct bpf_iarray *new;
-
- new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT);
- if (!new) {
- /* this is what callers always want, so simplify the call site */
- kvfree(old);
- return NULL;
- }
-
- new->cnt = n_elem;
- return new;
-}
-
-static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items)
-{
- struct bpf_insn_array_value *value;
- u32 i;
-
- for (i = start; i <= end; i++) {
- value = map->ops->map_lookup_elem(map, &i);
- /*
- * map_lookup_elem of an array map will never return an error,
- * but not checking it makes some static analysers to worry
- */
- if (IS_ERR(value))
- return PTR_ERR(value);
- else if (!value)
- return -EINVAL;
- items[i - start] = value->xlated_off;
- }
- return 0;
-}
-
-static int cmp_ptr_to_u32(const void *a, const void *b)
-{
- return *(u32 *)a - *(u32 *)b;
-}
-
-static int sort_insn_array_uniq(u32 *items, int cnt)
-{
- int unique = 1;
- int i;
-
- sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL);
-
- for (i = 1; i < cnt; i++)
- if (items[i] != items[unique - 1])
- items[unique++] = items[i];
-
- return unique;
-}
-
-/*
- * sort_unique({map[start], ..., map[end]}) into off
- */
-static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off)
-{
- u32 n = end - start + 1;
- int err;
-
- err = copy_insn_array(map, start, end, off);
- if (err)
- return err;
-
- return sort_insn_array_uniq(off, n);
-}
-
-/*
- * Copy all unique offsets from the map
- */
-static struct bpf_iarray *jt_from_map(struct bpf_map *map)
-{
- struct bpf_iarray *jt;
- int err;
- int n;
-
- jt = iarray_realloc(NULL, map->max_entries);
- if (!jt)
- return ERR_PTR(-ENOMEM);
-
- n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items);
- if (n < 0) {
- err = n;
- goto err_free;
- }
- if (n == 0) {
- err = -EINVAL;
- goto err_free;
- }
- jt->cnt = n;
- return jt;
-
-err_free:
- kvfree(jt);
- return ERR_PTR(err);
-}
-
-/*
- * Find and collect all maps which fit in the subprog. Return the result as one
- * combined jump table in jt->items (allocated with kvcalloc)
- */
-static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env,
- int subprog_start, int subprog_end)
-{
- struct bpf_iarray *jt = NULL;
- struct bpf_map *map;
- struct bpf_iarray *jt_cur;
- int i;
-
- for (i = 0; i < env->insn_array_map_cnt; i++) {
- /*
- * TODO (when needed): collect only jump tables, not static keys
- * or maps for indirect calls
- */
- map = env->insn_array_maps[i];
-
- jt_cur = jt_from_map(map);
- if (IS_ERR(jt_cur)) {
- kvfree(jt);
- return jt_cur;
- }
-
- /*
- * This is enough to check one element. The full table is
- * checked to fit inside the subprog later in create_jt()
- */
- if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) {
- u32 old_cnt = jt ? jt->cnt : 0;
- jt = iarray_realloc(jt, old_cnt + jt_cur->cnt);
- if (!jt) {
- kvfree(jt_cur);
- return ERR_PTR(-ENOMEM);
- }
- memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2);
- }
-
- kvfree(jt_cur);
- }
-
- if (!jt) {
- verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start);
- return ERR_PTR(-EINVAL);
- }
-
- jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt);
- return jt;
-}
-
-static struct bpf_iarray *
-create_jt(int t, struct bpf_verifier_env *env)
-{
- static struct bpf_subprog_info *subprog;
- int subprog_start, subprog_end;
- struct bpf_iarray *jt;
- int i;
-
- subprog = bpf_find_containing_subprog(env, t);
- subprog_start = subprog->start;
- subprog_end = (subprog + 1)->start;
- jt = jt_from_subprog(env, subprog_start, subprog_end);
- if (IS_ERR(jt))
- return jt;
-
- /* Check that the every element of the jump table fits within the given subprogram */
- for (i = 0; i < jt->cnt; i++) {
- if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) {
- verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n",
- t, subprog_start, subprog_end);
- kvfree(jt);
- return ERR_PTR(-EINVAL);
- }
- }
-
- return jt;
-}
-
-/* "conditional jump with N edges" */
-static int visit_gotox_insn(int t, struct bpf_verifier_env *env)
-{
- int *insn_stack = env->cfg.insn_stack;
- int *insn_state = env->cfg.insn_state;
- bool keep_exploring = false;
- struct bpf_iarray *jt;
- int i, w;
-
- jt = env->insn_aux_data[t].jt;
- if (!jt) {
- jt = create_jt(t, env);
- if (IS_ERR(jt))
- return PTR_ERR(jt);
-
- env->insn_aux_data[t].jt = jt;
- }
-
- mark_prune_point(env, t);
- for (i = 0; i < jt->cnt; i++) {
- w = jt->items[i];
- if (w < 0 || w >= env->prog->len) {
- verbose(env, "indirect jump out of range from insn %d to %d\n", t, w);
- return -EINVAL;
- }
-
- mark_jmp_point(env, w);
-
- /* EXPLORED || DISCOVERED */
- if (insn_state[w])
- continue;
-
- if (env->cfg.cur_stack >= env->prog->len)
- return -E2BIG;
-
- insn_stack[env->cfg.cur_stack++] = w;
- insn_state[w] |= DISCOVERED;
- keep_exploring = true;
- }
-
- return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING;
-}
-
-static int visit_tailcall_insn(struct bpf_verifier_env *env, int t)
-{
- static struct bpf_subprog_info *subprog;
- struct bpf_iarray *jt;
-
- if (env->insn_aux_data[t].jt)
- return 0;
-
- jt = iarray_realloc(NULL, 2);
- if (!jt)
- return -ENOMEM;
-
- subprog = bpf_find_containing_subprog(env, t);
- jt->items[0] = t + 1;
- jt->items[1] = subprog->exit_idx;
- env->insn_aux_data[t].jt = jt;
- return 0;
-}
-
-/* Visits the instruction at index t and returns one of the following:
- * < 0 - an error occurred
- * DONE_EXPLORING - the instruction was fully explored
- * KEEP_EXPLORING - there is still work to be done before it is fully explored
- */
-static int visit_insn(int t, struct bpf_verifier_env *env)
-{
- struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
- int ret, off, insn_sz;
-
- if (bpf_pseudo_func(insn))
- return visit_func_call_insn(t, insns, env, true);
-
- /* All non-branch instructions have a single fall-through edge. */
- if (BPF_CLASS(insn->code) != BPF_JMP &&
- BPF_CLASS(insn->code) != BPF_JMP32) {
- insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
- return push_insn(t, t + insn_sz, FALLTHROUGH, env);
- }
-
- switch (BPF_OP(insn->code)) {
- case BPF_EXIT:
- return DONE_EXPLORING;
-
- case BPF_CALL:
- if (is_async_callback_calling_insn(insn))
- /* Mark this call insn as a prune point to trigger
- * is_state_visited() check before call itself is
- * processed by __check_func_call(). Otherwise new
- * async state will be pushed for further exploration.
- */
- mark_prune_point(env, t);
- /* For functions that invoke callbacks it is not known how many times
- * callback would be called. Verifier models callback calling functions
- * by repeatedly visiting callback bodies and returning to origin call
- * instruction.
- * In order to stop such iteration verifier needs to identify when a
- * state identical some state from a previous iteration is reached.
- * Check below forces creation of checkpoint before callback calling
- * instruction to allow search for such identical states.
- */
- if (is_sync_callback_calling_insn(insn)) {
- mark_calls_callback(env, t);
- mark_force_checkpoint(env, t);
- mark_prune_point(env, t);
- mark_jmp_point(env, t);
- }
- if (bpf_helper_call(insn)) {
- const struct bpf_func_proto *fp;
-
- ret = get_helper_proto(env, insn->imm, &fp);
- /* If called in a non-sleepable context program will be
- * rejected anyway, so we should end up with precise
- * sleepable marks on subprogs, except for dead code
- * elimination.
- */
- if (ret == 0 && fp->might_sleep)
- mark_subprog_might_sleep(env, t);
- if (bpf_helper_changes_pkt_data(insn->imm))
- mark_subprog_changes_pkt_data(env, t);
- if (insn->imm == BPF_FUNC_tail_call)
- visit_tailcall_insn(env, t);
- } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- struct bpf_kfunc_call_arg_meta meta;
-
- ret = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
- if (ret == 0 && is_iter_next_kfunc(&meta)) {
- mark_prune_point(env, t);
- /* Checking and saving state checkpoints at iter_next() call
- * is crucial for fast convergence of open-coded iterator loop
- * logic, so we need to force it. If we don't do that,
- * is_state_visited() might skip saving a checkpoint, causing
- * unnecessarily long sequence of not checkpointed
- * instructions and jumps, leading to exhaustion of jump
- * history buffer, and potentially other undesired outcomes.
- * It is expected that with correct open-coded iterators
- * convergence will happen quickly, so we don't run a risk of
- * exhausting memory.
- */
- mark_force_checkpoint(env, t);
- }
- /* Same as helpers, if called in a non-sleepable context
- * program will be rejected anyway, so we should end up
- * with precise sleepable marks on subprogs, except for
- * dead code elimination.
- */
- if (ret == 0 && is_kfunc_sleepable(&meta))
- mark_subprog_might_sleep(env, t);
- if (ret == 0 && is_kfunc_pkt_changing(&meta))
- mark_subprog_changes_pkt_data(env, t);
- }
- return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
-
- case BPF_JA:
- if (BPF_SRC(insn->code) == BPF_X)
- return visit_gotox_insn(t, env);
-
- if (BPF_CLASS(insn->code) == BPF_JMP)
- off = insn->off;
- else
- off = insn->imm;
-
- /* unconditional jump with single edge */
- ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
- if (ret)
- return ret;
-
- mark_prune_point(env, t + off + 1);
- mark_jmp_point(env, t + off + 1);
-
- return ret;
-
- default:
- /* conditional jump with two edges */
- mark_prune_point(env, t);
- if (is_may_goto_insn(insn))
- mark_force_checkpoint(env, t);
-
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
- if (ret)
- return ret;
-
- return push_insn(t, t + insn->off + 1, BRANCH, env);
- }
-}
-
-/* non-recursive depth-first-search to detect loops in BPF program
- * loop == back-edge in directed graph
- */
-static int check_cfg(struct bpf_verifier_env *env)
-{
- int insn_cnt = env->prog->len;
- int *insn_stack, *insn_state;
- int ex_insn_beg, i, ret = 0;
-
- insn_state = env->cfg.insn_state = kvzalloc_objs(int, insn_cnt,
- GFP_KERNEL_ACCOUNT);
- if (!insn_state)
- return -ENOMEM;
-
- insn_stack = env->cfg.insn_stack = kvzalloc_objs(int, insn_cnt,
- GFP_KERNEL_ACCOUNT);
- if (!insn_stack) {
- kvfree(insn_state);
- return -ENOMEM;
- }
-
- ex_insn_beg = env->exception_callback_subprog
- ? env->subprog_info[env->exception_callback_subprog].start
- : 0;
-
- insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
- insn_stack[0] = 0; /* 0 is the first instruction */
- env->cfg.cur_stack = 1;
-
-walk_cfg:
- while (env->cfg.cur_stack > 0) {
- int t = insn_stack[env->cfg.cur_stack - 1];
-
- ret = visit_insn(t, env);
- switch (ret) {
- case DONE_EXPLORING:
- insn_state[t] = EXPLORED;
- env->cfg.cur_stack--;
- break;
- case KEEP_EXPLORING:
- break;
- default:
- if (ret > 0) {
- verifier_bug(env, "visit_insn internal bug");
- ret = -EFAULT;
- }
- goto err_free;
- }
- }
-
- if (env->cfg.cur_stack < 0) {
- verifier_bug(env, "pop stack internal bug");
- ret = -EFAULT;
- goto err_free;
- }
-
- if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) {
- insn_state[ex_insn_beg] = DISCOVERED;
- insn_stack[0] = ex_insn_beg;
- env->cfg.cur_stack = 1;
- goto walk_cfg;
- }
-
- for (i = 0; i < insn_cnt; i++) {
- struct bpf_insn *insn = &env->prog->insnsi[i];
-
- if (insn_state[i] != EXPLORED) {
- verbose(env, "unreachable insn %d\n", i);
- ret = -EINVAL;
- goto err_free;
- }
- if (bpf_is_ldimm64(insn)) {
- if (insn_state[i + 1] != 0) {
- verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
- ret = -EINVAL;
- goto err_free;
- }
- i++; /* skip second half of ldimm64 */
- }
- }
- ret = 0; /* cfg looks good */
- env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
- env->prog->aux->might_sleep = env->subprog_info[0].might_sleep;
-
-err_free:
- kvfree(insn_state);
- kvfree(insn_stack);
- env->cfg.insn_state = env->cfg.insn_stack = NULL;
- return ret;
-}
-
-/*
- * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range
- * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start)
- * with indices of 'i' instructions in postorder.
- */
-static int compute_postorder(struct bpf_verifier_env *env)
-{
- u32 cur_postorder, i, top, stack_sz, s;
- int *stack = NULL, *postorder = NULL, *state = NULL;
- struct bpf_iarray *succ;
-
- postorder = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
- state = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
- stack = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT);
- if (!postorder || !state || !stack) {
- kvfree(postorder);
- kvfree(state);
- kvfree(stack);
- return -ENOMEM;
- }
- cur_postorder = 0;
- for (i = 0; i < env->subprog_cnt; i++) {
- env->subprog_info[i].postorder_start = cur_postorder;
- stack[0] = env->subprog_info[i].start;
- stack_sz = 1;
- do {
- top = stack[stack_sz - 1];
- state[top] |= DISCOVERED;
- if (state[top] & EXPLORED) {
- postorder[cur_postorder++] = top;
- stack_sz--;
- continue;
- }
- succ = bpf_insn_successors(env, top);
- for (s = 0; s < succ->cnt; ++s) {
- if (!state[succ->items[s]]) {
- stack[stack_sz++] = succ->items[s];
- state[succ->items[s]] |= DISCOVERED;
- }
- }
- state[top] |= EXPLORED;
- } while (stack_sz);
- }
- env->subprog_info[i].postorder_start = cur_postorder;
- env->cfg.insn_postorder = postorder;
- env->cfg.cur_postorder = cur_postorder;
- kvfree(stack);
- kvfree(state);
- return 0;
-}
-
-static int check_abnormal_return(struct bpf_verifier_env *env)
-{
- int i;
-
- for (i = 1; i < env->subprog_cnt; i++) {
- if (env->subprog_info[i].has_ld_abs) {
- verbose(env, "LD_ABS is not allowed in subprogs without BTF\n");
- return -EINVAL;
- }
- if (env->subprog_info[i].has_tail_call) {
- verbose(env, "tail_call is not allowed in subprogs without BTF\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-
-/* The minimum supported BTF func info size */
-#define MIN_BPF_FUNCINFO_SIZE 8
-#define MAX_FUNCINFO_REC_SIZE 252
-
-static int check_btf_func_early(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- u32 krec_size = sizeof(struct bpf_func_info);
- const struct btf_type *type, *func_proto;
- u32 i, nfuncs, urec_size, min_size;
- struct bpf_func_info *krecord;
- struct bpf_prog *prog;
- const struct btf *btf;
- u32 prev_offset = 0;
- bpfptr_t urecord;
- int ret = -ENOMEM;
-
- nfuncs = attr->func_info_cnt;
- if (!nfuncs) {
- if (check_abnormal_return(env))
- return -EINVAL;
- return 0;
- }
-
- urec_size = attr->func_info_rec_size;
- if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
- urec_size > MAX_FUNCINFO_REC_SIZE ||
- urec_size % sizeof(u32)) {
- verbose(env, "invalid func info rec size %u\n", urec_size);
- return -EINVAL;
- }
-
- prog = env->prog;
- btf = prog->aux->btf;
-
- urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
- min_size = min_t(u32, krec_size, urec_size);
-
- krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (!krecord)
- return -ENOMEM;
-
- for (i = 0; i < nfuncs; i++) {
- ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
- if (ret) {
- if (ret == -E2BIG) {
- verbose(env, "nonzero tailing record in func info");
- /* set the size kernel expects so loader can zero
- * out the rest of the record.
- */
- if (copy_to_bpfptr_offset(uattr,
- offsetof(union bpf_attr, func_info_rec_size),
- &min_size, sizeof(min_size)))
- ret = -EFAULT;
- }
- goto err_free;
- }
-
- if (copy_from_bpfptr(&krecord[i], urecord, min_size)) {
- ret = -EFAULT;
- goto err_free;
- }
-
- /* check insn_off */
- ret = -EINVAL;
- if (i == 0) {
- if (krecord[i].insn_off) {
- verbose(env,
- "nonzero insn_off %u for the first func info record",
- krecord[i].insn_off);
- goto err_free;
- }
- } else if (krecord[i].insn_off <= prev_offset) {
- verbose(env,
- "same or smaller insn offset (%u) than previous func info record (%u)",
- krecord[i].insn_off, prev_offset);
- goto err_free;
- }
-
- /* check type_id */
- type = btf_type_by_id(btf, krecord[i].type_id);
- if (!type || !btf_type_is_func(type)) {
- verbose(env, "invalid type id %d in func info",
- krecord[i].type_id);
- goto err_free;
- }
-
- func_proto = btf_type_by_id(btf, type->type);
- if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
- /* btf_func_check() already verified it during BTF load */
- goto err_free;
-
- prev_offset = krecord[i].insn_off;
- bpfptr_add(&urecord, urec_size);
- }
-
- prog->aux->func_info = krecord;
- prog->aux->func_info_cnt = nfuncs;
- return 0;
-
-err_free:
- kvfree(krecord);
- return ret;
-}
-
-static int check_btf_func(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- const struct btf_type *type, *func_proto, *ret_type;
- u32 i, nfuncs, urec_size;
- struct bpf_func_info *krecord;
- struct bpf_func_info_aux *info_aux = NULL;
- struct bpf_prog *prog;
- const struct btf *btf;
- bpfptr_t urecord;
- bool scalar_return;
- int ret = -ENOMEM;
-
- nfuncs = attr->func_info_cnt;
- if (!nfuncs) {
- if (check_abnormal_return(env))
- return -EINVAL;
- return 0;
- }
- if (nfuncs != env->subprog_cnt) {
- verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
- return -EINVAL;
- }
-
- urec_size = attr->func_info_rec_size;
-
- prog = env->prog;
- btf = prog->aux->btf;
-
- urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
-
- krecord = prog->aux->func_info;
- info_aux = kzalloc_objs(*info_aux, nfuncs,
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (!info_aux)
- return -ENOMEM;
-
- for (i = 0; i < nfuncs; i++) {
- /* check insn_off */
- ret = -EINVAL;
-
- if (env->subprog_info[i].start != krecord[i].insn_off) {
- verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
- goto err_free;
- }
-
- /* Already checked type_id */
- type = btf_type_by_id(btf, krecord[i].type_id);
- info_aux[i].linkage = BTF_INFO_VLEN(type->info);
- /* Already checked func_proto */
- func_proto = btf_type_by_id(btf, type->type);
-
- ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
- scalar_return =
- btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
- if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
- verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
- goto err_free;
- }
- if (i && !scalar_return && env->subprog_info[i].has_tail_call) {
- verbose(env, "tail_call is only allowed in functions that return 'int'.\n");
- goto err_free;
- }
-
- bpfptr_add(&urecord, urec_size);
- }
-
- prog->aux->func_info_aux = info_aux;
- return 0;
-
-err_free:
- kfree(info_aux);
- return ret;
-}
-
static void adjust_btf_func(struct bpf_verifier_env *env)
{
struct bpf_prog_aux *aux = env->prog->aux;
@@ -19301,414 +17237,6 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
aux->func_info[i].insn_off = env->subprog_info[i].start;
}
-#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col)
-#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
-
-static int check_btf_line(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
- struct bpf_subprog_info *sub;
- struct bpf_line_info *linfo;
- struct bpf_prog *prog;
- const struct btf *btf;
- bpfptr_t ulinfo;
- int err;
-
- nr_linfo = attr->line_info_cnt;
- if (!nr_linfo)
- return 0;
- if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
- return -EINVAL;
-
- rec_size = attr->line_info_rec_size;
- if (rec_size < MIN_BPF_LINEINFO_SIZE ||
- rec_size > MAX_LINEINFO_REC_SIZE ||
- rec_size & (sizeof(u32) - 1))
- return -EINVAL;
-
- /* Need to zero it in case the userspace may
- * pass in a smaller bpf_line_info object.
- */
- linfo = kvzalloc_objs(struct bpf_line_info, nr_linfo,
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (!linfo)
- return -ENOMEM;
-
- prog = env->prog;
- btf = prog->aux->btf;
-
- s = 0;
- sub = env->subprog_info;
- ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel);
- expected_size = sizeof(struct bpf_line_info);
- ncopy = min_t(u32, expected_size, rec_size);
- for (i = 0; i < nr_linfo; i++) {
- err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
- if (err) {
- if (err == -E2BIG) {
- verbose(env, "nonzero tailing record in line_info");
- if (copy_to_bpfptr_offset(uattr,
- offsetof(union bpf_attr, line_info_rec_size),
- &expected_size, sizeof(expected_size)))
- err = -EFAULT;
- }
- goto err_free;
- }
-
- if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) {
- err = -EFAULT;
- goto err_free;
- }
-
- /*
- * Check insn_off to ensure
- * 1) strictly increasing AND
- * 2) bounded by prog->len
- *
- * The linfo[0].insn_off == 0 check logically falls into
- * the later "missing bpf_line_info for func..." case
- * because the first linfo[0].insn_off must be the
- * first sub also and the first sub must have
- * subprog_info[0].start == 0.
- */
- if ((i && linfo[i].insn_off <= prev_offset) ||
- linfo[i].insn_off >= prog->len) {
- verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
- i, linfo[i].insn_off, prev_offset,
- prog->len);
- err = -EINVAL;
- goto err_free;
- }
-
- if (!prog->insnsi[linfo[i].insn_off].code) {
- verbose(env,
- "Invalid insn code at line_info[%u].insn_off\n",
- i);
- err = -EINVAL;
- goto err_free;
- }
-
- if (!btf_name_by_offset(btf, linfo[i].line_off) ||
- !btf_name_by_offset(btf, linfo[i].file_name_off)) {
- verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
- err = -EINVAL;
- goto err_free;
- }
-
- if (s != env->subprog_cnt) {
- if (linfo[i].insn_off == sub[s].start) {
- sub[s].linfo_idx = i;
- s++;
- } else if (sub[s].start < linfo[i].insn_off) {
- verbose(env, "missing bpf_line_info for func#%u\n", s);
- err = -EINVAL;
- goto err_free;
- }
- }
-
- prev_offset = linfo[i].insn_off;
- bpfptr_add(&ulinfo, rec_size);
- }
-
- if (s != env->subprog_cnt) {
- verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
- env->subprog_cnt - s, s);
- err = -EINVAL;
- goto err_free;
- }
-
- prog->aux->linfo = linfo;
- prog->aux->nr_linfo = nr_linfo;
-
- return 0;
-
-err_free:
- kvfree(linfo);
- return err;
-}
-
-#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo)
-#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE
-
-static int check_core_relo(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- u32 i, nr_core_relo, ncopy, expected_size, rec_size;
- struct bpf_core_relo core_relo = {};
- struct bpf_prog *prog = env->prog;
- const struct btf *btf = prog->aux->btf;
- struct bpf_core_ctx ctx = {
- .log = &env->log,
- .btf = btf,
- };
- bpfptr_t u_core_relo;
- int err;
-
- nr_core_relo = attr->core_relo_cnt;
- if (!nr_core_relo)
- return 0;
- if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
- return -EINVAL;
-
- rec_size = attr->core_relo_rec_size;
- if (rec_size < MIN_CORE_RELO_SIZE ||
- rec_size > MAX_CORE_RELO_SIZE ||
- rec_size % sizeof(u32))
- return -EINVAL;
-
- u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
- expected_size = sizeof(struct bpf_core_relo);
- ncopy = min_t(u32, expected_size, rec_size);
-
- /* Unlike func_info and line_info, copy and apply each CO-RE
- * relocation record one at a time.
- */
- for (i = 0; i < nr_core_relo; i++) {
- /* future proofing when sizeof(bpf_core_relo) changes */
- err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
- if (err) {
- if (err == -E2BIG) {
- verbose(env, "nonzero tailing record in core_relo");
- if (copy_to_bpfptr_offset(uattr,
- offsetof(union bpf_attr, core_relo_rec_size),
- &expected_size, sizeof(expected_size)))
- err = -EFAULT;
- }
- break;
- }
-
- if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
- err = -EFAULT;
- break;
- }
-
- if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
- verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
- i, core_relo.insn_off, prog->len);
- err = -EINVAL;
- break;
- }
-
- err = bpf_core_apply(&ctx, &core_relo, i,
- &prog->insnsi[core_relo.insn_off / 8]);
- if (err)
- break;
- bpfptr_add(&u_core_relo, rec_size);
- }
- return err;
-}
-
-static int check_btf_info_early(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- struct btf *btf;
- int err;
-
- if (!attr->func_info_cnt && !attr->line_info_cnt) {
- if (check_abnormal_return(env))
- return -EINVAL;
- return 0;
- }
-
- btf = btf_get_by_fd(attr->prog_btf_fd);
- if (IS_ERR(btf))
- return PTR_ERR(btf);
- if (btf_is_kernel(btf)) {
- btf_put(btf);
- return -EACCES;
- }
- env->prog->aux->btf = btf;
-
- err = check_btf_func_early(env, attr, uattr);
- if (err)
- return err;
- return 0;
-}
-
-static int check_btf_info(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
-{
- int err;
-
- if (!attr->func_info_cnt && !attr->line_info_cnt) {
- if (check_abnormal_return(env))
- return -EINVAL;
- return 0;
- }
-
- err = check_btf_func(env, attr, uattr);
- if (err)
- return err;
-
- err = check_btf_line(env, attr, uattr);
- if (err)
- return err;
-
- err = check_core_relo(env, attr, uattr);
- if (err)
- return err;
-
- return 0;
-}
-
-/* check %cur's range satisfies %old's */
-static bool range_within(const struct bpf_reg_state *old,
- const struct bpf_reg_state *cur)
-{
- return old->umin_value <= cur->umin_value &&
- old->umax_value >= cur->umax_value &&
- old->smin_value <= cur->smin_value &&
- old->smax_value >= cur->smax_value &&
- old->u32_min_value <= cur->u32_min_value &&
- old->u32_max_value >= cur->u32_max_value &&
- old->s32_min_value <= cur->s32_min_value &&
- old->s32_max_value >= cur->s32_max_value;
-}
-
-/* If in the old state two registers had the same id, then they need to have
- * the same id in the new state as well. But that id could be different from
- * the old state, so we need to track the mapping from old to new ids.
- * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
- * regs with old id 5 must also have new id 9 for the new state to be safe. But
- * regs with a different old id could still have new id 9, we don't care about
- * that.
- * So we look through our idmap to see if this old id has been seen before. If
- * so, we require the new id to match; otherwise, we add the id pair to the map.
- */
-static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
-{
- struct bpf_id_pair *map = idmap->map;
- unsigned int i;
-
- /* either both IDs should be set or both should be zero */
- if (!!old_id != !!cur_id)
- return false;
-
- if (old_id == 0) /* cur_id == 0 as well */
- return true;
-
- for (i = 0; i < idmap->cnt; i++) {
- if (map[i].old == old_id)
- return map[i].cur == cur_id;
- if (map[i].cur == cur_id)
- return false;
- }
-
- /* Reached the end of known mappings; haven't seen this id before */
- if (idmap->cnt < BPF_ID_MAP_SIZE) {
- map[idmap->cnt].old = old_id;
- map[idmap->cnt].cur = cur_id;
- idmap->cnt++;
- return true;
- }
-
- /* We ran out of idmap slots, which should be impossible */
- WARN_ON_ONCE(1);
- return false;
-}
-
-/*
- * Compare scalar register IDs for state equivalence.
- *
- * When old_id == 0, the old register is independent - not linked to any
- * other register. Any linking in the current state only adds constraints,
- * making it more restrictive. Since the old state didn't rely on any ID
- * relationships for this register, it's always safe to accept cur regardless
- * of its ID. Hence, return true immediately.
- *
- * When old_id != 0 but cur_id == 0, we need to ensure that different
- * independent registers in cur don't incorrectly satisfy the ID matching
- * requirements of linked registers in old.
- *
- * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0
- * and r7.id=0 (both independent), without temp IDs both would map old_id=X
- * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map
- * X->temp2, but X is already mapped to temp1, so the check fails correctly.
- */
-static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
-{
- if (!old_id)
- return true;
-
- cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
-
- return check_ids(old_id, cur_id, idmap);
-}
-
-static void clean_func_state(struct bpf_verifier_env *env,
- struct bpf_func_state *st,
- u32 ip)
-{
- u16 live_regs = env->insn_aux_data[ip].live_regs_before;
- int i, j;
-
- for (i = 0; i < BPF_REG_FP; i++) {
- /* liveness must not touch this register anymore */
- if (!(live_regs & BIT(i)))
- /* since the register is unused, clear its state
- * to make further comparison simpler
- */
- __mark_reg_not_init(env, &st->regs[i]);
- }
-
- for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
- if (!bpf_stack_slot_alive(env, st->frameno, i)) {
- __mark_reg_not_init(env, &st->stack[i].spilled_ptr);
- for (j = 0; j < BPF_REG_SIZE; j++)
- st->stack[i].slot_type[j] = STACK_INVALID;
- }
- }
-}
-
-static void clean_verifier_state(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st)
-{
- int i, ip;
-
- bpf_live_stack_query_init(env, st);
- st->cleaned = true;
- for (i = 0; i <= st->curframe; i++) {
- ip = frame_insn_idx(st, i);
- clean_func_state(env, st->frame[i], ip);
- }
-}
-
-/* the parentage chains form a tree.
- * the verifier states are added to state lists at given insn and
- * pushed into state stack for future exploration.
- * when the verifier reaches bpf_exit insn some of the verifier states
- * stored in the state lists have their final liveness state already,
- * but a lot of states will get revised from liveness point of view when
- * the verifier explores other branches.
- * Example:
- * 1: *(u64)(r10 - 8) = 1
- * 2: if r1 == 100 goto pc+1
- * 3: *(u64)(r10 - 8) = 2
- * 4: r0 = *(u64)(r10 - 8)
- * 5: exit
- * when the verifier reaches exit insn the stack slot -8 in the state list of
- * insn 2 is not yet marked alive. Then the verifier pops the other_branch
- * of insn 2 and goes exploring further. After the insn 4 read, liveness
- * analysis would propagate read mark for -8 at insn 2.
- *
- * Since the verifier pushes the branch states as it sees them while exploring
- * the program the condition of walking the branch instruction for the second
- * time means that all states below this branch were already explored and
- * their final liveness marks are already propagated.
- * Hence when the verifier completes the search of state list in is_state_visited()
- * we can call this clean_live_states() function to clear dead the registers and stack
- * slots to simplify state merging.
- *
- * Important note here that walking the same branch instruction in the callee
- * doesn't meant that the states are DONE. The verifier has to compare
- * the callsites
- */
-
/* Find id in idset and increment its count, or add new entry */
static void idset_cnt_inc(struct bpf_idset *idset, u32 id)
{
@@ -19745,8 +17273,8 @@ static u32 idset_cnt_get(struct bpf_idset *idset, u32 id)
* A register with a non-zero id is called singular if no other register shares
* the same base id. Such registers can be treated as independent (id=0).
*/
-static void clear_singular_ids(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st)
+void bpf_clear_singular_ids(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
{
struct bpf_idset *idset = &env->idset_scratch;
struct bpf_func_state *func;
@@ -19767,1072 +17295,11 @@ static void clear_singular_ids(struct bpf_verifier_env *env,
continue;
if (!reg->id)
continue;
- if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) {
- reg->id = 0;
- reg->off = 0;
- }
+ if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1)
+ clear_scalar_id(reg);
}));
}
-static void clean_live_states(struct bpf_verifier_env *env, int insn,
- struct bpf_verifier_state *cur)
-{
- struct bpf_verifier_state_list *sl;
- struct list_head *pos, *head;
-
- head = explored_state(env, insn);
- list_for_each(pos, head) {
- sl = container_of(pos, struct bpf_verifier_state_list, node);
- if (sl->state.branches)
- continue;
- if (sl->state.insn_idx != insn ||
- !same_callsites(&sl->state, cur))
- continue;
- if (sl->state.cleaned)
- /* all regs in this state in all frames were already marked */
- continue;
- if (incomplete_read_marks(env, &sl->state))
- continue;
- clean_verifier_state(env, &sl->state);
- }
-}
-
-static bool regs_exact(const struct bpf_reg_state *rold,
- const struct bpf_reg_state *rcur,
- struct bpf_idmap *idmap)
-{
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
- check_ids(rold->id, rcur->id, idmap) &&
- check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
-}
-
-enum exact_level {
- NOT_EXACT,
- EXACT,
- RANGE_WITHIN
-};
-
-/* Returns true if (rold safe implies rcur safe) */
-static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
- struct bpf_reg_state *rcur, struct bpf_idmap *idmap,
- enum exact_level exact)
-{
- if (exact == EXACT)
- return regs_exact(rold, rcur, idmap);
-
- if (rold->type == NOT_INIT)
- /* explored state can't have used this */
- return true;
-
- /* Enforce that register types have to match exactly, including their
- * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
- * rule.
- *
- * One can make a point that using a pointer register as unbounded
- * SCALAR would be technically acceptable, but this could lead to
- * pointer leaks because scalars are allowed to leak while pointers
- * are not. We could make this safe in special cases if root is
- * calling us, but it's probably not worth the hassle.
- *
- * Also, register types that are *not* MAYBE_NULL could technically be
- * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
- * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
- * to the same map).
- * However, if the old MAYBE_NULL register then got NULL checked,
- * doing so could have affected others with the same id, and we can't
- * check for that because we lost the id when we converted to
- * a non-MAYBE_NULL variant.
- * So, as a general rule we don't allow mixing MAYBE_NULL and
- * non-MAYBE_NULL registers as well.
- */
- if (rold->type != rcur->type)
- return false;
-
- switch (base_type(rold->type)) {
- case SCALAR_VALUE:
- if (env->explore_alu_limits) {
- /* explore_alu_limits disables tnum_in() and range_within()
- * logic and requires everything to be strict
- */
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
- check_scalar_ids(rold->id, rcur->id, idmap);
- }
- if (!rold->precise && exact == NOT_EXACT)
- return true;
- /*
- * Linked register tracking uses rold->id to detect relationships.
- * When rold->id == 0, the register is independent and any linking
- * in rcur only adds constraints. When rold->id != 0, we must verify
- * id mapping and (for BPF_ADD_CONST) offset consistency.
- *
- * +------------------+-----------+------------------+---------------+
- * | | rold->id | rold + ADD_CONST | rold->id == 0 |
- * |------------------+-----------+------------------+---------------|
- * | rcur->id | range,ids | false | range |
- * | rcur + ADD_CONST | false | range,ids,off | range |
- * | rcur->id == 0 | range,ids | false | range |
- * +------------------+-----------+------------------+---------------+
- *
- * Why check_ids() for scalar registers?
- *
- * Consider the following BPF code:
- * 1: r6 = ... unbound scalar, ID=a ...
- * 2: r7 = ... unbound scalar, ID=b ...
- * 3: if (r6 > r7) goto +1
- * 4: r6 = r7
- * 5: if (r6 > X) goto ...
- * 6: ... memory operation using r7 ...
- *
- * First verification path is [1-6]:
- * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
- * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark
- * r7 <= X, because r6 and r7 share same id.
- * Next verification path is [1-4, 6].
- *
- * Instruction (6) would be reached in two states:
- * I. r6{.id=b}, r7{.id=b} via path 1-6;
- * II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
- *
- * Use check_ids() to distinguish these states.
- * ---
- * Also verify that new value satisfies old value range knowledge.
- */
-
- /*
- * ADD_CONST flags must match exactly: BPF_ADD_CONST32 and
- * BPF_ADD_CONST64 have different linking semantics in
- * sync_linked_regs() (alu32 zero-extends, alu64 does not),
- * so pruning across different flag types is unsafe.
- */
- if (rold->id &&
- (rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
- return false;
-
- /* Both have offset linkage: offsets must match */
- if ((rold->id & BPF_ADD_CONST) && rold->off != rcur->off)
- return false;
-
- if (!check_scalar_ids(rold->id, rcur->id, idmap))
- return false;
-
- return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_MAP_KEY:
- case PTR_TO_MAP_VALUE:
- case PTR_TO_MEM:
- case PTR_TO_BUF:
- case PTR_TO_TP_BUFFER:
- /* If the new min/max/var_off satisfy the old ones and
- * everything else matches, we are OK.
- */
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
- range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off) &&
- check_ids(rold->id, rcur->id, idmap) &&
- check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
- case PTR_TO_PACKET_META:
- case PTR_TO_PACKET:
- /* We must have at least as much range as the old ptr
- * did, so that any accesses which were safe before are
- * still safe. This is true even if old range < old off,
- * since someone could have accessed through (ptr - k), or
- * even done ptr -= k in a register, to get a safe access.
- */
- if (rold->range < 0 || rcur->range < 0) {
- /* special case for [BEYOND|AT]_PKT_END */
- if (rold->range != rcur->range)
- return false;
- } else if (rold->range > rcur->range) {
- return false;
- }
- /* If the offsets don't match, we can't trust our alignment;
- * nor can we be sure that we won't fall out of range.
- */
- if (rold->off != rcur->off)
- return false;
- /* id relations must be preserved */
- if (!check_ids(rold->id, rcur->id, idmap))
- return false;
- /* new val must satisfy old val knowledge */
- return range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_STACK:
- /* two stack pointers are equal only if they're pointing to
- * the same stack frame, since fp-8 in foo != fp-8 in bar
- */
- return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
- case PTR_TO_ARENA:
- return true;
- case PTR_TO_INSN:
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
- rold->off == rcur->off && range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
- default:
- return regs_exact(rold, rcur, idmap);
- }
-}
-
-static struct bpf_reg_state unbound_reg;
-
-static __init int unbound_reg_init(void)
-{
- __mark_reg_unknown_imprecise(&unbound_reg);
- return 0;
-}
-late_initcall(unbound_reg_init);
-
-static bool is_stack_all_misc(struct bpf_verifier_env *env,
- struct bpf_stack_state *stack)
-{
- u32 i;
-
- for (i = 0; i < ARRAY_SIZE(stack->slot_type); ++i) {
- if ((stack->slot_type[i] == STACK_MISC) ||
- (stack->slot_type[i] == STACK_INVALID && env->allow_uninit_stack))
- continue;
- return false;
- }
-
- return true;
-}
-
-static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env,
- struct bpf_stack_state *stack)
-{
- if (is_spilled_scalar_reg64(stack))
- return &stack->spilled_ptr;
-
- if (is_stack_all_misc(env, stack))
- return &unbound_reg;
-
- return NULL;
-}
-
-static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, struct bpf_idmap *idmap,
- enum exact_level exact)
-{
- int i, spi;
-
- /* walk slots of the explored stack and ignore any additional
- * slots in the current stack, since explored(safe) state
- * didn't use them
- */
- for (i = 0; i < old->allocated_stack; i++) {
- struct bpf_reg_state *old_reg, *cur_reg;
-
- spi = i / BPF_REG_SIZE;
-
- if (exact == EXACT &&
- (i >= cur->allocated_stack ||
- old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
- cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
- return false;
-
- if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
- continue;
-
- if (env->allow_uninit_stack &&
- old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
- continue;
-
- /* explored stack has more populated slots than current stack
- * and these slots were used
- */
- if (i >= cur->allocated_stack)
- return false;
-
- /* 64-bit scalar spill vs all slots MISC and vice versa.
- * Load from all slots MISC produces unbound scalar.
- * Construct a fake register for such stack and call
- * regsafe() to ensure scalar ids are compared.
- */
- old_reg = scalar_reg_for_stack(env, &old->stack[spi]);
- cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]);
- if (old_reg && cur_reg) {
- if (!regsafe(env, old_reg, cur_reg, idmap, exact))
- return false;
- i += BPF_REG_SIZE - 1;
- continue;
- }
-
- /* if old state was safe with misc data in the stack
- * it will be safe with zero-initialized stack.
- * The opposite is not true
- */
- if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
- cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
- continue;
- if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
- cur->stack[spi].slot_type[i % BPF_REG_SIZE])
- /* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has STACK_MISC ->
- * this verifier states are not equivalent,
- * return false to continue verification of this path
- */
- return false;
- if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
- continue;
- /* Both old and cur are having same slot_type */
- switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
- case STACK_SPILL:
- /* when explored and current stack slot are both storing
- * spilled registers, check that stored pointers types
- * are the same as well.
- * Ex: explored safe path could have stored
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
- * but current path has stored:
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
- * such verifier states are not equivalent.
- * return false to continue verification of this path
- */
- if (!regsafe(env, &old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr, idmap, exact))
- return false;
- break;
- case STACK_DYNPTR:
- old_reg = &old->stack[spi].spilled_ptr;
- cur_reg = &cur->stack[spi].spilled_ptr;
- if (old_reg->dynptr.type != cur_reg->dynptr.type ||
- old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
- !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
- return false;
- break;
- case STACK_ITER:
- old_reg = &old->stack[spi].spilled_ptr;
- cur_reg = &cur->stack[spi].spilled_ptr;
- /* iter.depth is not compared between states as it
- * doesn't matter for correctness and would otherwise
- * prevent convergence; we maintain it only to prevent
- * infinite loop check triggering, see
- * iter_active_depths_differ()
- */
- if (old_reg->iter.btf != cur_reg->iter.btf ||
- old_reg->iter.btf_id != cur_reg->iter.btf_id ||
- old_reg->iter.state != cur_reg->iter.state ||
- /* ignore {old_reg,cur_reg}->iter.depth, see above */
- !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
- return false;
- break;
- case STACK_IRQ_FLAG:
- old_reg = &old->stack[spi].spilled_ptr;
- cur_reg = &cur->stack[spi].spilled_ptr;
- if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) ||
- old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class)
- return false;
- break;
- case STACK_MISC:
- case STACK_ZERO:
- case STACK_INVALID:
- continue;
- /* Ensure that new unhandled slot types return false by default */
- default:
- return false;
- }
- }
- return true;
-}
-
-static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
- struct bpf_idmap *idmap)
-{
- int i;
-
- if (old->acquired_refs != cur->acquired_refs)
- return false;
-
- if (old->active_locks != cur->active_locks)
- return false;
-
- if (old->active_preempt_locks != cur->active_preempt_locks)
- return false;
-
- if (old->active_rcu_locks != cur->active_rcu_locks)
- return false;
-
- if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
- return false;
-
- if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) ||
- old->active_lock_ptr != cur->active_lock_ptr)
- return false;
-
- for (i = 0; i < old->acquired_refs; i++) {
- if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
- old->refs[i].type != cur->refs[i].type)
- return false;
- switch (old->refs[i].type) {
- case REF_TYPE_PTR:
- case REF_TYPE_IRQ:
- break;
- case REF_TYPE_LOCK:
- case REF_TYPE_RES_LOCK:
- case REF_TYPE_RES_LOCK_IRQ:
- if (old->refs[i].ptr != cur->refs[i].ptr)
- return false;
- break;
- default:
- WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
- return false;
- }
- }
-
- return true;
-}
-
-/* compare two verifier states
- *
- * all states stored in state_list are known to be valid, since
- * verifier reached 'bpf_exit' instruction through them
- *
- * this function is called when verifier exploring different branches of
- * execution popped from the state stack. If it sees an old state that has
- * more strict register state and more strict stack state then this execution
- * branch doesn't need to be explored further, since verifier already
- * concluded that more strict state leads to valid finish.
- *
- * Therefore two states are equivalent if register state is more conservative
- * and explored stack state is more conservative than the current one.
- * Example:
- * explored current
- * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
- * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
- *
- * In other words if current stack state (one being explored) has more
- * valid slots than old one that already passed validation, it means
- * the verifier can stop exploring and conclude that current state is valid too
- *
- * Similarly with registers. If explored state has register type as invalid
- * whereas register type in current state is meaningful, it means that
- * the current state will reach 'bpf_exit' instruction safely
- */
-static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact)
-{
- u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before;
- u16 i;
-
- if (old->callback_depth > cur->callback_depth)
- return false;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (((1 << i) & live_regs) &&
- !regsafe(env, &old->regs[i], &cur->regs[i],
- &env->idmap_scratch, exact))
- return false;
-
- if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
- return false;
-
- return true;
-}
-
-static void reset_idmap_scratch(struct bpf_verifier_env *env)
-{
- struct bpf_idmap *idmap = &env->idmap_scratch;
-
- idmap->tmp_id_gen = env->id_gen;
- idmap->cnt = 0;
-}
-
-static bool states_equal(struct bpf_verifier_env *env,
- struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur,
- enum exact_level exact)
-{
- u32 insn_idx;
- int i;
-
- if (old->curframe != cur->curframe)
- return false;
-
- reset_idmap_scratch(env);
-
- /* Verification state from speculative execution simulation
- * must never prune a non-speculative execution one.
- */
- if (old->speculative && !cur->speculative)
- return false;
-
- if (old->in_sleepable != cur->in_sleepable)
- return false;
-
- if (!refsafe(old, cur, &env->idmap_scratch))
- return false;
-
- /* for states to be equal callsites have to be the same
- * and all frame states need to be equivalent
- */
- for (i = 0; i <= old->curframe; i++) {
- insn_idx = frame_insn_idx(old, i);
- if (old->frame[i]->callsite != cur->frame[i]->callsite)
- return false;
- if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact))
- return false;
- }
- return true;
-}
-
-/* find precise scalars in the previous equivalent state and
- * propagate them into the current state
- */
-static int propagate_precision(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur,
- bool *changed)
-{
- struct bpf_reg_state *state_reg;
- struct bpf_func_state *state;
- int i, err = 0, fr;
- bool first;
-
- for (fr = old->curframe; fr >= 0; fr--) {
- state = old->frame[fr];
- state_reg = state->regs;
- first = true;
- for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2) {
- if (first)
- verbose(env, "frame %d: propagating r%d", fr, i);
- else
- verbose(env, ",r%d", i);
- }
- bt_set_frame_reg(&env->bt, fr, i);
- first = false;
- }
-
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (!is_spilled_reg(&state->stack[i]))
- continue;
- state_reg = &state->stack[i].spilled_ptr;
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2) {
- if (first)
- verbose(env, "frame %d: propagating fp%d",
- fr, (-i - 1) * BPF_REG_SIZE);
- else
- verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
- }
- bt_set_frame_slot(&env->bt, fr, i);
- first = false;
- }
- if (!first && (env->log.level & BPF_LOG_LEVEL2))
- verbose(env, "\n");
- }
-
- err = __mark_chain_precision(env, cur, -1, changed);
- if (err < 0)
- return err;
-
- return 0;
-}
-
-#define MAX_BACKEDGE_ITERS 64
-
-/* Propagate read and precision marks from visit->backedges[*].state->equal_state
- * to corresponding parent states of visit->backedges[*].state until fixed point is reached,
- * then free visit->backedges.
- * After execution of this function incomplete_read_marks() will return false
- * for all states corresponding to @visit->callchain.
- */
-static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit)
-{
- struct bpf_scc_backedge *backedge;
- struct bpf_verifier_state *st;
- bool changed;
- int i, err;
-
- i = 0;
- do {
- if (i++ > MAX_BACKEDGE_ITERS) {
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "%s: too many iterations\n", __func__);
- for (backedge = visit->backedges; backedge; backedge = backedge->next)
- mark_all_scalars_precise(env, &backedge->state);
- break;
- }
- changed = false;
- for (backedge = visit->backedges; backedge; backedge = backedge->next) {
- st = &backedge->state;
- err = propagate_precision(env, st->equal_state, st, &changed);
- if (err)
- return err;
- }
- } while (changed);
-
- free_backedges(visit);
- return 0;
-}
-
-static bool states_maybe_looping(struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur)
-{
- struct bpf_func_state *fold, *fcur;
- int i, fr = cur->curframe;
-
- if (old->curframe != fr)
- return false;
-
- fold = old->frame[fr];
- fcur = cur->frame[fr];
- for (i = 0; i < MAX_BPF_REG; i++)
- if (memcmp(&fold->regs[i], &fcur->regs[i],
- offsetof(struct bpf_reg_state, frameno)))
- return false;
- return true;
-}
-
-static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
-{
- return env->insn_aux_data[insn_idx].is_iter_next;
-}
-
-/* is_state_visited() handles iter_next() (see process_iter_next_call() for
- * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
- * states to match, which otherwise would look like an infinite loop. So while
- * iter_next() calls are taken care of, we still need to be careful and
- * prevent erroneous and too eager declaration of "infinite loop", when
- * iterators are involved.
- *
- * Here's a situation in pseudo-BPF assembly form:
- *
- * 0: again: ; set up iter_next() call args
- * 1: r1 = &it ; <CHECKPOINT HERE>
- * 2: call bpf_iter_num_next ; this is iter_next() call
- * 3: if r0 == 0 goto done
- * 4: ... something useful here ...
- * 5: goto again ; another iteration
- * 6: done:
- * 7: r1 = &it
- * 8: call bpf_iter_num_destroy ; clean up iter state
- * 9: exit
- *
- * This is a typical loop. Let's assume that we have a prune point at 1:,
- * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
- * again`, assuming other heuristics don't get in a way).
- *
- * When we first time come to 1:, let's say we have some state X. We proceed
- * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
- * Now we come back to validate that forked ACTIVE state. We proceed through
- * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
- * are converging. But the problem is that we don't know that yet, as this
- * convergence has to happen at iter_next() call site only. So if nothing is
- * done, at 1: verifier will use bounded loop logic and declare infinite
- * looping (and would be *technically* correct, if not for iterator's
- * "eventual sticky NULL" contract, see process_iter_next_call()). But we
- * don't want that. So what we do in process_iter_next_call() when we go on
- * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
- * a different iteration. So when we suspect an infinite loop, we additionally
- * check if any of the *ACTIVE* iterator states depths differ. If yes, we
- * pretend we are not looping and wait for next iter_next() call.
- *
- * This only applies to ACTIVE state. In DRAINED state we don't expect to
- * loop, because that would actually mean infinite loop, as DRAINED state is
- * "sticky", and so we'll keep returning into the same instruction with the
- * same state (at least in one of possible code paths).
- *
- * This approach allows to keep infinite loop heuristic even in the face of
- * active iterator. E.g., C snippet below is and will be detected as
- * infinitely looping:
- *
- * struct bpf_iter_num it;
- * int *p, x;
- *
- * bpf_iter_num_new(&it, 0, 10);
- * while ((p = bpf_iter_num_next(&t))) {
- * x = p;
- * while (x--) {} // <<-- infinite loop here
- * }
- *
- */
-static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
-{
- struct bpf_reg_state *slot, *cur_slot;
- struct bpf_func_state *state;
- int i, fr;
-
- for (fr = old->curframe; fr >= 0; fr--) {
- state = old->frame[fr];
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (state->stack[i].slot_type[0] != STACK_ITER)
- continue;
-
- slot = &state->stack[i].spilled_ptr;
- if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
- continue;
-
- cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
- if (cur_slot->iter.depth != slot->iter.depth)
- return true;
- }
- }
- return false;
-}
-
-static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
-{
- struct bpf_verifier_state_list *new_sl;
- struct bpf_verifier_state_list *sl;
- struct bpf_verifier_state *cur = env->cur_state, *new;
- bool force_new_state, add_new_state, loop;
- int n, err, states_cnt = 0;
- struct list_head *pos, *tmp, *head;
-
- force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
- /* Avoid accumulating infinitely long jmp history */
- cur->jmp_history_cnt > 40;
-
- /* bpf progs typically have pruning point every 4 instructions
- * http://vger.kernel.org/bpfconf2019.html#session-1
- * Do not add new state for future pruning if the verifier hasn't seen
- * at least 2 jumps and at least 8 instructions.
- * This heuristics helps decrease 'total_states' and 'peak_states' metric.
- * In tests that amounts to up to 50% reduction into total verifier
- * memory consumption and 20% verifier time speedup.
- */
- add_new_state = force_new_state;
- if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
- env->insn_processed - env->prev_insn_processed >= 8)
- add_new_state = true;
-
- clean_live_states(env, insn_idx, cur);
-
- loop = false;
- head = explored_state(env, insn_idx);
- list_for_each_safe(pos, tmp, head) {
- sl = container_of(pos, struct bpf_verifier_state_list, node);
- states_cnt++;
- if (sl->state.insn_idx != insn_idx)
- continue;
-
- if (sl->state.branches) {
- struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
-
- if (frame->in_async_callback_fn &&
- frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
- /* Different async_entry_cnt means that the verifier is
- * processing another entry into async callback.
- * Seeing the same state is not an indication of infinite
- * loop or infinite recursion.
- * But finding the same state doesn't mean that it's safe
- * to stop processing the current state. The previous state
- * hasn't yet reached bpf_exit, since state.branches > 0.
- * Checking in_async_callback_fn alone is not enough either.
- * Since the verifier still needs to catch infinite loops
- * inside async callbacks.
- */
- goto skip_inf_loop_check;
- }
- /* BPF open-coded iterators loop detection is special.
- * states_maybe_looping() logic is too simplistic in detecting
- * states that *might* be equivalent, because it doesn't know
- * about ID remapping, so don't even perform it.
- * See process_iter_next_call() and iter_active_depths_differ()
- * for overview of the logic. When current and one of parent
- * states are detected as equivalent, it's a good thing: we prove
- * convergence and can stop simulating further iterations.
- * It's safe to assume that iterator loop will finish, taking into
- * account iter_next() contract of eventually returning
- * sticky NULL result.
- *
- * Note, that states have to be compared exactly in this case because
- * read and precision marks might not be finalized inside the loop.
- * E.g. as in the program below:
- *
- * 1. r7 = -16
- * 2. r6 = bpf_get_prandom_u32()
- * 3. while (bpf_iter_num_next(&fp[-8])) {
- * 4. if (r6 != 42) {
- * 5. r7 = -32
- * 6. r6 = bpf_get_prandom_u32()
- * 7. continue
- * 8. }
- * 9. r0 = r10
- * 10. r0 += r7
- * 11. r8 = *(u64 *)(r0 + 0)
- * 12. r6 = bpf_get_prandom_u32()
- * 13. }
- *
- * Here verifier would first visit path 1-3, create a checkpoint at 3
- * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
- * not have read or precision mark for r7 yet, thus inexact states
- * comparison would discard current state with r7=-32
- * => unsafe memory access at 11 would not be caught.
- */
- if (is_iter_next_insn(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
- struct bpf_func_state *cur_frame;
- struct bpf_reg_state *iter_state, *iter_reg;
- int spi;
-
- cur_frame = cur->frame[cur->curframe];
- /* btf_check_iter_kfuncs() enforces that
- * iter state pointer is always the first arg
- */
- iter_reg = &cur_frame->regs[BPF_REG_1];
- /* current state is valid due to states_equal(),
- * so we can assume valid iter and reg state,
- * no need for extra (re-)validations
- */
- spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
- iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
- if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
- loop = true;
- goto hit;
- }
- }
- goto skip_inf_loop_check;
- }
- if (is_may_goto_insn_at(env, insn_idx)) {
- if (sl->state.may_goto_depth != cur->may_goto_depth &&
- states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
- loop = true;
- goto hit;
- }
- }
- if (bpf_calls_callback(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
- loop = true;
- goto hit;
- }
- goto skip_inf_loop_check;
- }
- /* attempt to detect infinite loop to avoid unnecessary doomed work */
- if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur, EXACT) &&
- !iter_active_depths_differ(&sl->state, cur) &&
- sl->state.may_goto_depth == cur->may_goto_depth &&
- sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
- verbose_linfo(env, insn_idx, "; ");
- verbose(env, "infinite loop detected at insn %d\n", insn_idx);
- verbose(env, "cur state:");
- print_verifier_state(env, cur, cur->curframe, true);
- verbose(env, "old state:");
- print_verifier_state(env, &sl->state, cur->curframe, true);
- return -EINVAL;
- }
- /* if the verifier is processing a loop, avoid adding new state
- * too often, since different loop iterations have distinct
- * states and may not help future pruning.
- * This threshold shouldn't be too low to make sure that
- * a loop with large bound will be rejected quickly.
- * The most abusive loop will be:
- * r1 += 1
- * if r1 < 1000000 goto pc-2
- * 1M insn_procssed limit / 100 == 10k peak states.
- * This threshold shouldn't be too high either, since states
- * at the end of the loop are likely to be useful in pruning.
- */
-skip_inf_loop_check:
- if (!force_new_state &&
- env->jmps_processed - env->prev_jmps_processed < 20 &&
- env->insn_processed - env->prev_insn_processed < 100)
- add_new_state = false;
- goto miss;
- }
- /* See comments for mark_all_regs_read_and_precise() */
- loop = incomplete_read_marks(env, &sl->state);
- if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) {
-hit:
- sl->hit_cnt++;
-
- /* if previous state reached the exit with precision and
- * current state is equivalent to it (except precision marks)
- * the precision needs to be propagated back in
- * the current state.
- */
- err = 0;
- if (is_jmp_point(env, env->insn_idx))
- err = push_jmp_history(env, cur, 0, 0);
- err = err ? : propagate_precision(env, &sl->state, cur, NULL);
- if (err)
- return err;
- /* When processing iterator based loops above propagate_liveness and
- * propagate_precision calls are not sufficient to transfer all relevant
- * read and precision marks. E.g. consider the following case:
- *
- * .-> A --. Assume the states are visited in the order A, B, C.
- * | | | Assume that state B reaches a state equivalent to state A.
- * | v v At this point, state C is not processed yet, so state A
- * '-- B C has not received any read or precision marks from C.
- * Thus, marks propagated from A to B are incomplete.
- *
- * The verifier mitigates this by performing the following steps:
- *
- * - Prior to the main verification pass, strongly connected components
- * (SCCs) are computed over the program's control flow graph,
- * intraprocedurally.
- *
- * - During the main verification pass, `maybe_enter_scc()` checks
- * whether the current verifier state is entering an SCC. If so, an
- * instance of a `bpf_scc_visit` object is created, and the state
- * entering the SCC is recorded as the entry state.
- *
- * - This instance is associated not with the SCC itself, but with a
- * `bpf_scc_callchain`: a tuple consisting of the call sites leading to
- * the SCC and the SCC id. See `compute_scc_callchain()`.
- *
- * - When a verification path encounters a `states_equal(...,
- * RANGE_WITHIN)` condition, there exists a call chain describing the
- * current state and a corresponding `bpf_scc_visit` instance. A copy
- * of the current state is created and added to
- * `bpf_scc_visit->backedges`.
- *
- * - When a verification path terminates, `maybe_exit_scc()` is called
- * from `update_branch_counts()`. For states with `branches == 0`, it
- * checks whether the state is the entry state of any `bpf_scc_visit`
- * instance. If it is, this indicates that all paths originating from
- * this SCC visit have been explored. `propagate_backedges()` is then
- * called, which propagates read and precision marks through the
- * backedges until a fixed point is reached.
- * (In the earlier example, this would propagate marks from A to B,
- * from C to A, and then again from A to B.)
- *
- * A note on callchains
- * --------------------
- *
- * Consider the following example:
- *
- * void foo() { loop { ... SCC#1 ... } }
- * void main() {
- * A: foo();
- * B: ...
- * C: foo();
- * }
- *
- * Here, there are two distinct callchains leading to SCC#1:
- * - (A, SCC#1)
- * - (C, SCC#1)
- *
- * Each callchain identifies a separate `bpf_scc_visit` instance that
- * accumulates backedge states. The `propagate_{liveness,precision}()`
- * functions traverse the parent state of each backedge state, which
- * means these parent states must remain valid (i.e., not freed) while
- * the corresponding `bpf_scc_visit` instance exists.
- *
- * Associating `bpf_scc_visit` instances directly with SCCs instead of
- * callchains would break this invariant:
- * - States explored during `C: foo()` would contribute backedges to
- * SCC#1, but SCC#1 would only be exited once the exploration of
- * `A: foo()` completes.
- * - By that time, the states explored between `A: foo()` and `C: foo()`
- * (i.e., `B: ...`) may have already been freed, causing the parent
- * links for states from `C: foo()` to become invalid.
- */
- if (loop) {
- struct bpf_scc_backedge *backedge;
-
- backedge = kzalloc_obj(*backedge,
- GFP_KERNEL_ACCOUNT);
- if (!backedge)
- return -ENOMEM;
- err = copy_verifier_state(&backedge->state, cur);
- backedge->state.equal_state = &sl->state;
- backedge->state.insn_idx = insn_idx;
- err = err ?: add_scc_backedge(env, &sl->state, backedge);
- if (err) {
- free_verifier_state(&backedge->state, false);
- kfree(backedge);
- return err;
- }
- }
- return 1;
- }
-miss:
- /* when new state is not going to be added do not increase miss count.
- * Otherwise several loop iterations will remove the state
- * recorded earlier. The goal of these heuristics is to have
- * states from some iterations of the loop (some in the beginning
- * and some at the end) to help pruning.
- */
- if (add_new_state)
- sl->miss_cnt++;
- /* heuristic to determine whether this state is beneficial
- * to keep checking from state equivalence point of view.
- * Higher numbers increase max_states_per_insn and verification time,
- * but do not meaningfully decrease insn_processed.
- * 'n' controls how many times state could miss before eviction.
- * Use bigger 'n' for checkpoints because evicting checkpoint states
- * too early would hinder iterator convergence.
- */
- n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
- if (sl->miss_cnt > sl->hit_cnt * n + n) {
- /* the state is unlikely to be useful. Remove it to
- * speed up verification
- */
- sl->in_free_list = true;
- list_del(&sl->node);
- list_add(&sl->node, &env->free_list);
- env->free_list_size++;
- env->explored_states_size--;
- maybe_free_verifier_state(env, sl);
- }
- }
-
- if (env->max_states_per_insn < states_cnt)
- env->max_states_per_insn = states_cnt;
-
- if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
- return 0;
-
- if (!add_new_state)
- return 0;
-
- /* There were no equivalent states, remember the current one.
- * Technically the current state is not proven to be safe yet,
- * but it will either reach outer most bpf_exit (which means it's safe)
- * or it will be rejected. When there are no loops the verifier won't be
- * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
- * again on the way to bpf_exit.
- * When looping the sl->state.branches will be > 0 and this state
- * will not be considered for equivalence until branches == 0.
- */
- new_sl = kzalloc_obj(struct bpf_verifier_state_list, GFP_KERNEL_ACCOUNT);
- if (!new_sl)
- return -ENOMEM;
- env->total_states++;
- env->explored_states_size++;
- update_peak_states(env);
- env->prev_jmps_processed = env->jmps_processed;
- env->prev_insn_processed = env->insn_processed;
-
- /* forget precise markings we inherited, see __mark_chain_precision */
- if (env->bpf_capable)
- mark_all_scalars_imprecise(env, cur);
-
- clear_singular_ids(env, cur);
-
- /* add new state to the head of linked list */
- new = &new_sl->state;
- err = copy_verifier_state(new, cur);
- if (err) {
- free_verifier_state(new, false);
- kfree(new_sl);
- return err;
- }
- new->insn_idx = insn_idx;
- verifier_bug_if(new->branches != 1, env,
- "%s:branches_to_explore=%d insn %d",
- __func__, new->branches, insn_idx);
- err = maybe_enter_scc(env, new);
- if (err) {
- free_verifier_state(new, false);
- kfree(new_sl);
- return err;
- }
-
- cur->parent = new;
- cur->first_insn_idx = insn_idx;
- cur->dfs_depth = new->dfs_depth + 1;
- clear_jmp_history(cur);
- list_add(&new_sl->node, head);
- return 0;
-}
-
/* Return true if it's OK to have the same insn return a different type. */
static bool reg_type_mismatch_ok(enum bpf_reg_type type)
{
@@ -20933,13 +17400,16 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ
}
enum {
- PROCESS_BPF_EXIT = 1
+ PROCESS_BPF_EXIT = 1,
+ INSN_IDX_UPDATED = 2,
};
static int process_bpf_exit_full(struct bpf_verifier_env *env,
bool *do_print_state,
bool exception_exit)
{
+ struct bpf_func_state *cur_frame = cur_func(env);
+
/* We must do check_reference_leak here before
* prepare_func_exit to handle the case when
* state->curframe > 0, it may be a callback function,
@@ -20971,10 +17441,24 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env,
if (err)
return err;
*do_print_state = true;
- return 0;
+ return INSN_IDX_UPDATED;
}
- err = check_return_code(env, BPF_REG_0, "R0");
+ /*
+ * Return from a regular global subprogram differs from return
+ * from the main program or async/exception callback.
+ * Main program exit implies return code restrictions
+ * that depend on program type.
+ * Exit from exception callback is equivalent to main program exit.
+ * Exit from async callback implies return code restrictions
+ * that depend on async scheduling mechanism.
+ */
+ if (cur_frame->subprogno &&
+ !cur_frame->in_async_callback_fn &&
+ !cur_frame->in_exception_callback_fn)
+ err = check_global_subprog_return_code(env);
+ else
+ err = check_return_code(env, BPF_REG_0, "R0");
if (err)
return err;
return PROCESS_BPF_EXIT;
@@ -20986,19 +17470,16 @@ static int indirect_jump_min_max_index(struct bpf_verifier_env *env,
u32 *pmin_index, u32 *pmax_index)
{
struct bpf_reg_state *reg = reg_state(env, regno);
- u64 min_index, max_index;
+ u64 min_index = reg->umin_value;
+ u64 max_index = reg->umax_value;
const u32 size = 8;
- if (check_add_overflow(reg->umin_value, reg->off, &min_index) ||
- (min_index > (u64) U32_MAX * size)) {
- verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n",
- regno, reg->umin_value, reg->off);
+ if (min_index > (u64) U32_MAX * size) {
+ verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value);
return -ERANGE;
}
- if (check_add_overflow(reg->umax_value, reg->off, &max_index) ||
- (max_index > (u64) U32_MAX * size)) {
- verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n",
- regno, reg->umax_value, reg->off);
+ if (max_index > (u64) U32_MAX * size) {
+ verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value);
return -ERANGE;
}
@@ -21048,13 +17529,13 @@ static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *in
/* Ensure that the buffer is large enough */
if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) {
- env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf,
- max_index - min_index + 1);
+ env->gotox_tmp_buf = bpf_iarray_realloc(env->gotox_tmp_buf,
+ max_index - min_index + 1);
if (!env->gotox_tmp_buf)
return -ENOMEM;
}
- n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items);
+ n = bpf_copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items);
if (n < 0)
return n;
if (n == 0) {
@@ -21070,7 +17551,7 @@ static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *in
return PTR_ERR(other_branch);
}
env->insn_idx = env->gotox_tmp_buf->items[n-1];
- return 0;
+ return INSN_IDX_UPDATED;
}
static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
@@ -21079,81 +17560,48 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
struct bpf_insn *insn = &env->prog->insnsi[env->insn_idx];
u8 class = BPF_CLASS(insn->code);
- if (class == BPF_ALU || class == BPF_ALU64) {
- err = check_alu_op(env, insn);
- if (err)
- return err;
-
- } else if (class == BPF_LDX) {
- bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX;
+ switch (class) {
+ case BPF_ALU:
+ case BPF_ALU64:
+ return check_alu_op(env, insn);
- /* Check for reserved fields is already done in
- * resolve_pseudo_ldimm64().
- */
- err = check_load_mem(env, insn, false, is_ldsx, true, "ldx");
- if (err)
- return err;
- } else if (class == BPF_STX) {
- if (BPF_MODE(insn->code) == BPF_ATOMIC) {
- err = check_atomic(env, insn);
- if (err)
- return err;
- env->insn_idx++;
- return 0;
- }
+ case BPF_LDX:
+ return check_load_mem(env, insn, false,
+ BPF_MODE(insn->code) == BPF_MEMSX,
+ true, "ldx");
- if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
- verbose(env, "BPF_STX uses reserved fields\n");
- return -EINVAL;
- }
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC)
+ return check_atomic(env, insn);
+ return check_store_reg(env, insn, false);
- err = check_store_reg(env, insn, false);
- if (err)
- return err;
- } else if (class == BPF_ST) {
+ case BPF_ST: {
enum bpf_reg_type dst_reg_type;
- if (BPF_MODE(insn->code) != BPF_MEM ||
- insn->src_reg != BPF_REG_0) {
- verbose(env, "BPF_ST uses reserved fields\n");
- return -EINVAL;
- }
- /* check src operand */
err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
dst_reg_type = cur_regs(env)[insn->dst_reg].type;
- /* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
BPF_WRITE, -1, false, false);
if (err)
return err;
- err = save_aux_ptr_type(env, dst_reg_type, false);
- if (err)
- return err;
- } else if (class == BPF_JMP || class == BPF_JMP32) {
+ return save_aux_ptr_type(env, dst_reg_type, false);
+ }
+ case BPF_JMP:
+ case BPF_JMP32: {
u8 opcode = BPF_OP(insn->code);
env->jmps_processed++;
if (opcode == BPF_CALL) {
- if (BPF_SRC(insn->code) != BPF_K ||
- (insn->src_reg != BPF_PSEUDO_KFUNC_CALL &&
- insn->off != 0) ||
- (insn->src_reg != BPF_REG_0 &&
- insn->src_reg != BPF_PSEUDO_CALL &&
- insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
- insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) {
- verbose(env, "BPF_CALL uses reserved fields\n");
- return -EINVAL;
- }
-
if (env->cur_state->active_locks) {
if ((insn->src_reg == BPF_REG_0 &&
- insn->imm != BPF_FUNC_spin_unlock) ||
+ insn->imm != BPF_FUNC_spin_unlock &&
+ insn->imm != BPF_FUNC_kptr_xchg) ||
(insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
(insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) {
verbose(env,
@@ -21161,84 +17609,45 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
return -EINVAL;
}
}
- if (insn->src_reg == BPF_PSEUDO_CALL) {
- err = check_func_call(env, insn, &env->insn_idx);
- } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- err = check_kfunc_call(env, insn, &env->insn_idx);
- if (!err && is_bpf_throw_kfunc(insn))
- return process_bpf_exit_full(env, do_print_state, true);
- } else {
- err = check_helper_call(env, insn, &env->insn_idx);
- }
- if (err)
- return err;
-
mark_reg_scratched(env, BPF_REG_0);
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ return check_func_call(env, insn, &env->insn_idx);
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ return check_kfunc_call(env, insn, &env->insn_idx);
+ return check_helper_call(env, insn, &env->insn_idx);
} else if (opcode == BPF_JA) {
- if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->src_reg != BPF_REG_0 ||
- insn->imm != 0 || insn->off != 0) {
- verbose(env, "BPF_JA|BPF_X uses reserved fields\n");
- return -EINVAL;
- }
+ if (BPF_SRC(insn->code) == BPF_X)
return check_indirect_jump(env, insn);
- }
-
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0 ||
- (class == BPF_JMP && insn->imm != 0) ||
- (class == BPF_JMP32 && insn->off != 0)) {
- verbose(env, "BPF_JA uses reserved fields\n");
- return -EINVAL;
- }
if (class == BPF_JMP)
env->insn_idx += insn->off + 1;
else
env->insn_idx += insn->imm + 1;
- return 0;
+ return INSN_IDX_UPDATED;
} else if (opcode == BPF_EXIT) {
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->imm != 0 ||
- insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0 ||
- class == BPF_JMP32) {
- verbose(env, "BPF_EXIT uses reserved fields\n");
- return -EINVAL;
- }
return process_bpf_exit_full(env, do_print_state, false);
- } else {
- err = check_cond_jmp_op(env, insn, &env->insn_idx);
- if (err)
- return err;
}
- } else if (class == BPF_LD) {
+ return check_cond_jmp_op(env, insn, &env->insn_idx);
+ }
+ case BPF_LD: {
u8 mode = BPF_MODE(insn->code);
- if (mode == BPF_ABS || mode == BPF_IND) {
- err = check_ld_abs(env, insn);
- if (err)
- return err;
+ if (mode == BPF_ABS || mode == BPF_IND)
+ return check_ld_abs(env, insn);
- } else if (mode == BPF_IMM) {
+ if (mode == BPF_IMM) {
err = check_ld_imm(env, insn);
if (err)
return err;
env->insn_idx++;
sanitize_mark_insn_seen(env);
- } else {
- verbose(env, "invalid BPF_LD mode\n");
- return -EINVAL;
}
- } else {
- verbose(env, "unknown insn class %d\n", class);
- return -EINVAL;
+ return 0;
}
-
- env->insn_idx++;
- return 0;
+ }
+ /* all class values are handled above. silence compiler warning */
+ return -EFAULT;
}
static int do_check(struct bpf_verifier_env *env)
@@ -21253,7 +17662,7 @@ static int do_check(struct bpf_verifier_env *env)
for (;;) {
struct bpf_insn *insn;
struct bpf_insn_aux_data *insn_aux;
- int err, marks_err;
+ int err;
/* reset current history entry on each new instruction */
env->cur_hist_ent = NULL;
@@ -21278,8 +17687,8 @@ static int do_check(struct bpf_verifier_env *env)
state->last_insn_idx = env->prev_insn_idx;
state->insn_idx = env->insn_idx;
- if (is_prune_point(env, env->insn_idx)) {
- err = is_state_visited(env, env->insn_idx);
+ if (bpf_is_prune_point(env, env->insn_idx)) {
+ err = bpf_is_state_visited(env, env->insn_idx);
if (err < 0)
return err;
if (err == 1) {
@@ -21297,8 +17706,8 @@ static int do_check(struct bpf_verifier_env *env)
}
}
- if (is_jmp_point(env, env->insn_idx)) {
- err = push_jmp_history(env, state, 0, 0);
+ if (bpf_is_jmp_point(env, env->insn_idx)) {
+ err = bpf_push_jmp_history(env, state, 0, 0);
if (err)
return err;
}
@@ -21325,7 +17734,7 @@ static int do_check(struct bpf_verifier_env *env)
verbose_linfo(env, env->insn_idx, "; ");
env->prev_log_pos = env->log.end_pos;
verbose(env, "%d: ", env->insn_idx);
- verbose_insn(env, insn);
+ bpf_verbose_insn(env, insn);
env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
env->prev_log_pos = env->log.end_pos;
}
@@ -21340,21 +17749,34 @@ static int do_check(struct bpf_verifier_env *env)
sanitize_mark_insn_seen(env);
prev_insn_idx = env->insn_idx;
+ /* Sanity check: precomputed constants must match verifier state */
+ if (!state->speculative && insn_aux->const_reg_mask) {
+ struct bpf_reg_state *regs = cur_regs(env);
+ u16 mask = insn_aux->const_reg_mask;
+
+ for (int r = 0; r < ARRAY_SIZE(insn_aux->const_reg_vals); r++) {
+ u32 cval = insn_aux->const_reg_vals[r];
+
+ if (!(mask & BIT(r)))
+ continue;
+ if (regs[r].type != SCALAR_VALUE)
+ continue;
+ if (!tnum_is_const(regs[r].var_off))
+ continue;
+ if (verifier_bug_if((u32)regs[r].var_off.value != cval,
+ env, "const R%d: %u != %llu",
+ r, cval, regs[r].var_off.value))
+ return -EFAULT;
+ }
+ }
+
/* Reduce verification complexity by stopping speculative path
* verification when a nospec is encountered.
*/
if (state->speculative && insn_aux->nospec)
goto process_bpf_exit;
- err = bpf_reset_stack_write_marks(env, env->insn_idx);
- if (err)
- return err;
err = do_check_insn(env, &do_print_state);
- if (err >= 0 || error_recoverable_with_nospec(err)) {
- marks_err = bpf_commit_stack_write_marks(env);
- if (marks_err)
- return marks_err;
- }
if (error_recoverable_with_nospec(err) && state->speculative) {
/* Prevent this speculative path from ever reaching the
* insn that would have been unsafe to execute.
@@ -21369,8 +17791,10 @@ static int do_check(struct bpf_verifier_env *env)
return err;
} else if (err == PROCESS_BPF_EXIT) {
goto process_bpf_exit;
+ } else if (err == INSN_IDX_UPDATED) {
+ } else if (err == 0) {
+ env->insn_idx++;
}
- WARN_ON_ONCE(err);
if (state->speculative && insn_aux->nospec_result) {
/* If we are on a path that performed a jump-op, this
@@ -21394,10 +17818,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EFAULT;
process_bpf_exit:
mark_verifier_state_scratched(env);
- err = update_branch_counts(env, env->cur_state);
- if (err)
- return err;
- err = bpf_update_live_stack(env);
+ err = bpf_update_branch_counts(env, env->cur_state);
if (err)
return err;
err = pop_stack(env, &prev_insn_idx, &env->insn_idx,
@@ -21790,14 +18211,199 @@ static int add_used_map(struct bpf_verifier_env *env, int fd)
return __add_used_map(env, map);
}
-/* find and rewrite pseudo imm in ld_imm64 instructions:
+static int check_alu_fields(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+ u8 opcode = BPF_OP(insn->code);
+
+ switch (opcode) {
+ case BPF_NEG:
+ if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 ||
+ insn->off != 0 || insn->imm != 0) {
+ verbose(env, "BPF_NEG uses reserved fields\n");
+ return -EINVAL;
+ }
+ return 0;
+ case BPF_END:
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
+ (class == BPF_ALU64 && BPF_SRC(insn->code) != BPF_TO_LE)) {
+ verbose(env, "BPF_END uses reserved fields\n");
+ return -EINVAL;
+ }
+ return 0;
+ case BPF_MOV:
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (class == BPF_ALU) {
+ if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
+ insn->imm) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else if (insn->off == BPF_ADDR_SPACE_CAST) {
+ if (insn->imm != 1 && insn->imm != 1u << 16) {
+ verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
+ return -EINVAL;
+ }
+ } else if ((insn->off != 0 && insn->off != 8 &&
+ insn->off != 16 && insn->off != 32) || insn->imm) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ return 0;
+ case BPF_ADD:
+ case BPF_SUB:
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ case BPF_LSH:
+ case BPF_RSH:
+ case BPF_ARSH:
+ case BPF_MUL:
+ case BPF_DIV:
+ case BPF_MOD:
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
+ verbose(env, "BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else if (insn->src_reg != BPF_REG_0 ||
+ (insn->off != 0 && insn->off != 1) ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
+ verbose(env, "BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ return 0;
+ default:
+ verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
+ return -EINVAL;
+ }
+}
+
+static int check_jmp_fields(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+ u8 opcode = BPF_OP(insn->code);
+
+ switch (opcode) {
+ case BPF_CALL:
+ if (BPF_SRC(insn->code) != BPF_K ||
+ (insn->src_reg != BPF_PSEUDO_KFUNC_CALL && insn->off != 0) ||
+ (insn->src_reg != BPF_REG_0 && insn->src_reg != BPF_PSEUDO_CALL &&
+ insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
+ insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) {
+ verbose(env, "BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+ return 0;
+ case BPF_JA:
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->src_reg != BPF_REG_0 || insn->imm != 0 || insn->off != 0) {
+ verbose(env, "BPF_JA|BPF_X uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else if (insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 ||
+ (class == BPF_JMP && insn->imm != 0) ||
+ (class == BPF_JMP32 && insn->off != 0)) {
+ verbose(env, "BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+ return 0;
+ case BPF_EXIT:
+ if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
+ verbose(env, "BPF_EXIT uses reserved fields\n");
+ return -EINVAL;
+ }
+ return 0;
+ case BPF_JCOND:
+ if (insn->code != (BPF_JMP | BPF_JCOND) || insn->src_reg != BPF_MAY_GOTO ||
+ insn->dst_reg || insn->imm) {
+ verbose(env, "invalid may_goto imm %d\n", insn->imm);
+ return -EINVAL;
+ }
+ return 0;
+ default:
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0) {
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else if (insn->src_reg != BPF_REG_0) {
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
+ return -EINVAL;
+ }
+ return 0;
+ }
+}
+
+static int check_insn_fields(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ switch (BPF_CLASS(insn->code)) {
+ case BPF_ALU:
+ case BPF_ALU64:
+ return check_alu_fields(env, insn);
+ case BPF_LDX:
+ if ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) ||
+ insn->imm != 0) {
+ verbose(env, "BPF_LDX uses reserved fields\n");
+ return -EINVAL;
+ }
+ return 0;
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC)
+ return 0;
+ if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
+ verbose(env, "BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+ return 0;
+ case BPF_ST:
+ if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) {
+ verbose(env, "BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ return 0;
+ case BPF_JMP:
+ case BPF_JMP32:
+ return check_jmp_fields(env, insn);
+ case BPF_LD: {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ BPF_SIZE(insn->code) == BPF_DW ||
+ (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
+ verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else if (mode != BPF_IMM) {
+ verbose(env, "invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ return 0;
+ }
+ default:
+ verbose(env, "unknown insn class %d\n", BPF_CLASS(insn->code));
+ return -EINVAL;
+ }
+}
+
+/*
+ * Check that insns are sane and rewrite pseudo imm in ld_imm64 instructions:
*
* 1. if it accesses map FD, replace it with actual map pointer.
* 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
*
* NOTE: btf_vmlinux is required for converting pseudo btf_id.
*/
-static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
+static int check_and_resolve_insns(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
@@ -21808,13 +18414,14 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
return err;
for (i = 0; i < insn_cnt; i++, insn++) {
- if (BPF_CLASS(insn->code) == BPF_LDX &&
- ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) ||
- insn->imm != 0)) {
- verbose(env, "BPF_LDX uses reserved fields\n");
+ if (insn->dst_reg >= MAX_BPF_REG) {
+ verbose(env, "R%d is invalid\n", insn->dst_reg);
+ return -EINVAL;
+ }
+ if (insn->src_reg >= MAX_BPF_REG) {
+ verbose(env, "R%d is invalid\n", insn->src_reg);
return -EINVAL;
}
-
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_insn_aux_data *aux;
struct bpf_map *map;
@@ -21829,6 +18436,11 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
return -EINVAL;
}
+ if (insn[0].off != 0) {
+ verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
+ return -EINVAL;
+ }
+
if (insn[0].src_reg == 0)
/* valid generic load 64-bit imm */
goto next_insn;
@@ -21925,6 +18537,10 @@ next_insn:
verbose(env, "unknown opcode %02x\n", insn->code);
return -EINVAL;
}
+
+ err = check_insn_fields(env, insn);
+ if (err)
+ return err;
}
/* now all pseudo BPF_LD_IMM64 instructions load valid
@@ -21963,53 +18579,6 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
}
}
-/* single env->prog->insni[off] instruction was replaced with the range
- * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
- * [0, off) and [off, end) to new locations, so the patched range stays zero
- */
-static void adjust_insn_aux_data(struct bpf_verifier_env *env,
- struct bpf_prog *new_prog, u32 off, u32 cnt)
-{
- struct bpf_insn_aux_data *data = env->insn_aux_data;
- struct bpf_insn *insn = new_prog->insnsi;
- u32 old_seen = data[off].seen;
- u32 prog_len;
- int i;
-
- /* aux info at OFF always needs adjustment, no matter fast path
- * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
- * original insn at old prog.
- */
- data[off].zext_dst = insn_has_def32(insn + off + cnt - 1);
-
- if (cnt == 1)
- return;
- prog_len = new_prog->len;
-
- memmove(data + off + cnt - 1, data + off,
- sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
- memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1));
- for (i = off; i < off + cnt - 1; i++) {
- /* Expand insni[off]'s seen count to the patched range. */
- data[i].seen = old_seen;
- data[i].zext_dst = insn_has_def32(insn + i);
- }
-}
-
-static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
-{
- int i;
-
- if (len == 1)
- return;
- /* NOTE: fake 'exit' subprog should be updated as well. */
- for (i = 0; i <= env->subprog_cnt; i++) {
- if (env->subprog_info[i].start <= off)
- continue;
- env->subprog_info[i].start += len - 1;
- }
-}
-
static void release_insn_arrays(struct bpf_verifier_env *env)
{
int i;
@@ -22018,281 +18587,7 @@ static void release_insn_arrays(struct bpf_verifier_env *env)
bpf_insn_array_release(env->insn_array_maps[i]);
}
-static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len)
-{
- int i;
-
- if (len == 1)
- return;
-
- for (i = 0; i < env->insn_array_map_cnt; i++)
- bpf_insn_array_adjust(env->insn_array_maps[i], off, len);
-}
-
-static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len)
-{
- int i;
-
- for (i = 0; i < env->insn_array_map_cnt; i++)
- bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len);
-}
-
-static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
-{
- struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
- int i, sz = prog->aux->size_poke_tab;
- struct bpf_jit_poke_descriptor *desc;
-
- for (i = 0; i < sz; i++) {
- desc = &tab[i];
- if (desc->insn_idx <= off)
- continue;
- desc->insn_idx += len - 1;
- }
-}
-
-static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
- const struct bpf_insn *patch, u32 len)
-{
- struct bpf_prog *new_prog;
- struct bpf_insn_aux_data *new_data = NULL;
-
- if (len > 1) {
- new_data = vrealloc(env->insn_aux_data,
- array_size(env->prog->len + len - 1,
- sizeof(struct bpf_insn_aux_data)),
- GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!new_data)
- return NULL;
-
- env->insn_aux_data = new_data;
- }
-
- new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
- if (IS_ERR(new_prog)) {
- if (PTR_ERR(new_prog) == -ERANGE)
- verbose(env,
- "insn %d cannot be patched due to 16-bit range\n",
- env->insn_aux_data[off].orig_idx);
- return NULL;
- }
- adjust_insn_aux_data(env, new_prog, off, len);
- adjust_subprog_starts(env, off, len);
- adjust_insn_arrays(env, off, len);
- adjust_poke_descs(new_prog, off, len);
- return new_prog;
-}
-
-/*
- * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the
- * jump offset by 'delta'.
- */
-static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
-{
- struct bpf_insn *insn = prog->insnsi;
- u32 insn_cnt = prog->len, i;
- s32 imm;
- s16 off;
-
- for (i = 0; i < insn_cnt; i++, insn++) {
- u8 code = insn->code;
-
- if (tgt_idx <= i && i < tgt_idx + delta)
- continue;
-
- if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) ||
- BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT)
- continue;
-
- if (insn->code == (BPF_JMP32 | BPF_JA)) {
- if (i + 1 + insn->imm != tgt_idx)
- continue;
- if (check_add_overflow(insn->imm, delta, &imm))
- return -ERANGE;
- insn->imm = imm;
- } else {
- if (i + 1 + insn->off != tgt_idx)
- continue;
- if (check_add_overflow(insn->off, delta, &off))
- return -ERANGE;
- insn->off = off;
- }
- }
- return 0;
-}
-
-static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
- u32 off, u32 cnt)
-{
- int i, j;
-
- /* find first prog starting at or after off (first to remove) */
- for (i = 0; i < env->subprog_cnt; i++)
- if (env->subprog_info[i].start >= off)
- break;
- /* find first prog starting at or after off + cnt (first to stay) */
- for (j = i; j < env->subprog_cnt; j++)
- if (env->subprog_info[j].start >= off + cnt)
- break;
- /* if j doesn't start exactly at off + cnt, we are just removing
- * the front of previous prog
- */
- if (env->subprog_info[j].start != off + cnt)
- j--;
-
- if (j > i) {
- struct bpf_prog_aux *aux = env->prog->aux;
- int move;
-
- /* move fake 'exit' subprog as well */
- move = env->subprog_cnt + 1 - j;
-
- memmove(env->subprog_info + i,
- env->subprog_info + j,
- sizeof(*env->subprog_info) * move);
- env->subprog_cnt -= j - i;
-
- /* remove func_info */
- if (aux->func_info) {
- move = aux->func_info_cnt - j;
-
- memmove(aux->func_info + i,
- aux->func_info + j,
- sizeof(*aux->func_info) * move);
- aux->func_info_cnt -= j - i;
- /* func_info->insn_off is set after all code rewrites,
- * in adjust_btf_func() - no need to adjust
- */
- }
- } else {
- /* convert i from "first prog to remove" to "first to adjust" */
- if (env->subprog_info[i].start == off)
- i++;
- }
-
- /* update fake 'exit' subprog as well */
- for (; i <= env->subprog_cnt; i++)
- env->subprog_info[i].start -= cnt;
-
- return 0;
-}
-
-static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
- u32 cnt)
-{
- struct bpf_prog *prog = env->prog;
- u32 i, l_off, l_cnt, nr_linfo;
- struct bpf_line_info *linfo;
-
- nr_linfo = prog->aux->nr_linfo;
- if (!nr_linfo)
- return 0;
-
- linfo = prog->aux->linfo;
-
- /* find first line info to remove, count lines to be removed */
- for (i = 0; i < nr_linfo; i++)
- if (linfo[i].insn_off >= off)
- break;
-
- l_off = i;
- l_cnt = 0;
- for (; i < nr_linfo; i++)
- if (linfo[i].insn_off < off + cnt)
- l_cnt++;
- else
- break;
-
- /* First live insn doesn't match first live linfo, it needs to "inherit"
- * last removed linfo. prog is already modified, so prog->len == off
- * means no live instructions after (tail of the program was removed).
- */
- if (prog->len != off && l_cnt &&
- (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
- l_cnt--;
- linfo[--i].insn_off = off + cnt;
- }
-
- /* remove the line info which refer to the removed instructions */
- if (l_cnt) {
- memmove(linfo + l_off, linfo + i,
- sizeof(*linfo) * (nr_linfo - i));
-
- prog->aux->nr_linfo -= l_cnt;
- nr_linfo = prog->aux->nr_linfo;
- }
-
- /* pull all linfo[i].insn_off >= off + cnt in by cnt */
- for (i = l_off; i < nr_linfo; i++)
- linfo[i].insn_off -= cnt;
-
- /* fix up all subprogs (incl. 'exit') which start >= off */
- for (i = 0; i <= env->subprog_cnt; i++)
- if (env->subprog_info[i].linfo_idx > l_off) {
- /* program may have started in the removed region but
- * may not be fully removed
- */
- if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
- env->subprog_info[i].linfo_idx -= l_cnt;
- else
- env->subprog_info[i].linfo_idx = l_off;
- }
-
- return 0;
-}
-
-/*
- * Clean up dynamically allocated fields of aux data for instructions [start, ...]
- */
-static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len)
-{
- struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
- struct bpf_insn *insns = env->prog->insnsi;
- int end = start + len;
- int i;
- for (i = start; i < end; i++) {
- if (aux_data[i].jt) {
- kvfree(aux_data[i].jt);
- aux_data[i].jt = NULL;
- }
-
- if (bpf_is_ldimm64(&insns[i]))
- i++;
- }
-}
-
-static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
-{
- struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
- unsigned int orig_prog_len = env->prog->len;
- int err;
-
- if (bpf_prog_is_offloaded(env->prog->aux))
- bpf_prog_offload_remove_insns(env, off, cnt);
-
- /* Should be called before bpf_remove_insns, as it uses prog->insnsi */
- clear_insn_aux_data(env, off, cnt);
-
- err = bpf_remove_insns(env->prog, off, cnt);
- if (err)
- return err;
-
- err = adjust_subprog_starts_after_remove(env, off, cnt);
- if (err)
- return err;
-
- err = bpf_adj_linfo_after_remove(env, off, cnt);
- if (err)
- return err;
-
- adjust_insn_arrays_after_remove(env, off, cnt);
-
- memmove(aux_data + off, aux_data + off + cnt,
- sizeof(*aux_data) * (orig_prog_len - off - cnt));
-
- return 0;
-}
/* The verifier does more data flow analysis than llvm and will not
* explore branches that are dead at run time. Malicious programs can
@@ -22321,2210 +18616,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
}
}
-static bool insn_is_cond_jump(u8 code)
-{
- u8 op;
-
- op = BPF_OP(code);
- if (BPF_CLASS(code) == BPF_JMP32)
- return op != BPF_JA;
-
- if (BPF_CLASS(code) != BPF_JMP)
- return false;
-
- return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
-}
-
-static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
-{
- struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
- struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
- struct bpf_insn *insn = env->prog->insnsi;
- const int insn_cnt = env->prog->len;
- int i;
-
- for (i = 0; i < insn_cnt; i++, insn++) {
- if (!insn_is_cond_jump(insn->code))
- continue;
-
- if (!aux_data[i + 1].seen)
- ja.off = insn->off;
- else if (!aux_data[i + 1 + insn->off].seen)
- ja.off = 0;
- else
- continue;
-
- if (bpf_prog_is_offloaded(env->prog->aux))
- bpf_prog_offload_replace_insn(env, i, &ja);
-
- memcpy(insn, &ja, sizeof(ja));
- }
-}
-
-static int opt_remove_dead_code(struct bpf_verifier_env *env)
-{
- struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
- int insn_cnt = env->prog->len;
- int i, err;
-
- for (i = 0; i < insn_cnt; i++) {
- int j;
-
- j = 0;
- while (i + j < insn_cnt && !aux_data[i + j].seen)
- j++;
- if (!j)
- continue;
-
- err = verifier_remove_insns(env, i, j);
- if (err)
- return err;
- insn_cnt = env->prog->len;
- }
-
- return 0;
-}
-
-static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
-static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0);
-
-static int opt_remove_nops(struct bpf_verifier_env *env)
-{
- struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
- bool is_may_goto_0, is_ja;
- int i, err;
-
- for (i = 0; i < insn_cnt; i++) {
- is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0));
- is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP));
-
- if (!is_may_goto_0 && !is_ja)
- continue;
-
- err = verifier_remove_insns(env, i, 1);
- if (err)
- return err;
- insn_cnt--;
- /* Go back one insn to catch may_goto +1; may_goto +0 sequence */
- i -= (is_may_goto_0 && i > 0) ? 2 : 1;
- }
-
- return 0;
-}
-
-static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
- const union bpf_attr *attr)
-{
- struct bpf_insn *patch;
- /* use env->insn_buf as two independent buffers */
- struct bpf_insn *zext_patch = env->insn_buf;
- struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2];
- struct bpf_insn_aux_data *aux = env->insn_aux_data;
- int i, patch_len, delta = 0, len = env->prog->len;
- struct bpf_insn *insns = env->prog->insnsi;
- struct bpf_prog *new_prog;
- bool rnd_hi32;
-
- rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
- zext_patch[1] = BPF_ZEXT_REG(0);
- rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
- rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
- rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
- for (i = 0; i < len; i++) {
- int adj_idx = i + delta;
- struct bpf_insn insn;
- int load_reg;
-
- insn = insns[adj_idx];
- load_reg = insn_def_regno(&insn);
- if (!aux[adj_idx].zext_dst) {
- u8 code, class;
- u32 imm_rnd;
-
- if (!rnd_hi32)
- continue;
-
- code = insn.code;
- class = BPF_CLASS(code);
- if (load_reg == -1)
- continue;
-
- /* NOTE: arg "reg" (the fourth one) is only used for
- * BPF_STX + SRC_OP, so it is safe to pass NULL
- * here.
- */
- if (is_reg64(&insn, load_reg, NULL, DST_OP)) {
- if (class == BPF_LD &&
- BPF_MODE(code) == BPF_IMM)
- i++;
- continue;
- }
-
- /* ctx load could be transformed into wider load. */
- if (class == BPF_LDX &&
- aux[adj_idx].ptr_type == PTR_TO_CTX)
- continue;
-
- imm_rnd = get_random_u32();
- rnd_hi32_patch[0] = insn;
- rnd_hi32_patch[1].imm = imm_rnd;
- rnd_hi32_patch[3].dst_reg = load_reg;
- patch = rnd_hi32_patch;
- patch_len = 4;
- goto apply_patch_buffer;
- }
-
- /* Add in an zero-extend instruction if a) the JIT has requested
- * it or b) it's a CMPXCHG.
- *
- * The latter is because: BPF_CMPXCHG always loads a value into
- * R0, therefore always zero-extends. However some archs'
- * equivalent instruction only does this load when the
- * comparison is successful. This detail of CMPXCHG is
- * orthogonal to the general zero-extension behaviour of the
- * CPU, so it's treated independently of bpf_jit_needs_zext.
- */
- if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
- continue;
-
- /* Zero-extension is done by the caller. */
- if (bpf_pseudo_kfunc_call(&insn))
- continue;
-
- if (verifier_bug_if(load_reg == -1, env,
- "zext_dst is set, but no reg is defined"))
- return -EFAULT;
-
- zext_patch[0] = insn;
- zext_patch[1].dst_reg = load_reg;
- zext_patch[1].src_reg = load_reg;
- patch = zext_patch;
- patch_len = 2;
-apply_patch_buffer:
- new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
- if (!new_prog)
- return -ENOMEM;
- env->prog = new_prog;
- insns = new_prog->insnsi;
- aux = env->insn_aux_data;
- delta += patch_len - 1;
- }
-
- return 0;
-}
-
-/* convert load instructions that access fields of a context type into a
- * sequence of instructions that access fields of the underlying structure:
- * struct __sk_buff -> struct sk_buff
- * struct bpf_sock_ops -> struct sock
- */
-static int convert_ctx_accesses(struct bpf_verifier_env *env)
-{
- struct bpf_subprog_info *subprogs = env->subprog_info;
- const struct bpf_verifier_ops *ops = env->ops;
- int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0;
- const int insn_cnt = env->prog->len;
- struct bpf_insn *epilogue_buf = env->epilogue_buf;
- struct bpf_insn *insn_buf = env->insn_buf;
- struct bpf_insn *insn;
- u32 target_size, size_default, off;
- struct bpf_prog *new_prog;
- enum bpf_access_type type;
- bool is_narrower_load;
- int epilogue_idx = 0;
-
- if (ops->gen_epilogue) {
- epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
- -(subprogs[0].stack_depth + 8));
- if (epilogue_cnt >= INSN_BUF_SIZE) {
- verifier_bug(env, "epilogue is too long");
- return -EFAULT;
- } else if (epilogue_cnt) {
- /* Save the ARG_PTR_TO_CTX for the epilogue to use */
- cnt = 0;
- subprogs[0].stack_depth += 8;
- insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
- -subprogs[0].stack_depth);
- insn_buf[cnt++] = env->prog->insnsi[0];
- new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
- env->prog = new_prog;
- delta += cnt - 1;
-
- ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1);
- if (ret < 0)
- return ret;
- }
- }
-
- if (ops->gen_prologue || env->seen_direct_write) {
- if (!ops->gen_prologue) {
- verifier_bug(env, "gen_prologue is null");
- return -EFAULT;
- }
- cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
- env->prog);
- if (cnt >= INSN_BUF_SIZE) {
- verifier_bug(env, "prologue is too long");
- return -EFAULT;
- } else if (cnt) {
- new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- env->prog = new_prog;
- delta += cnt - 1;
-
- ret = add_kfunc_in_insns(env, insn_buf, cnt - 1);
- if (ret < 0)
- return ret;
- }
- }
-
- if (delta)
- WARN_ON(adjust_jmp_off(env->prog, 0, delta));
-
- if (bpf_prog_is_offloaded(env->prog->aux))
- return 0;
-
- insn = env->prog->insnsi + delta;
- for (i = 0; i < insn_cnt; i++, insn++) {
- bpf_convert_ctx_access_t convert_ctx_access;
- u8 mode;
-
- if (env->insn_aux_data[i + delta].nospec) {
- WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state);
- struct bpf_insn *patch = insn_buf;
-
- *patch++ = BPF_ST_NOSPEC();
- *patch++ = *insn;
- cnt = patch - insn_buf;
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- /* This can not be easily merged with the
- * nospec_result-case, because an insn may require a
- * nospec before and after itself. Therefore also do not
- * 'continue' here but potentially apply further
- * patching to insn. *insn should equal patch[1] now.
- */
- }
-
- if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_DW) ||
- insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) ||
- insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) ||
- insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) {
- type = BPF_READ;
- } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
- insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
- insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
- insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
- insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
- type = BPF_WRITE;
- } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) ||
- insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) ||
- insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
- insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
- env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
- insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
- env->prog->aux->num_exentries++;
- continue;
- } else if (insn->code == (BPF_JMP | BPF_EXIT) &&
- epilogue_cnt &&
- i + delta < subprogs[1].start) {
- /* Generate epilogue for the main prog */
- if (epilogue_idx) {
- /* jump back to the earlier generated epilogue */
- insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1);
- cnt = 1;
- } else {
- memcpy(insn_buf, epilogue_buf,
- epilogue_cnt * sizeof(*epilogue_buf));
- cnt = epilogue_cnt;
- /* epilogue_idx cannot be 0. It must have at
- * least one ctx ptr saving insn before the
- * epilogue.
- */
- epilogue_idx = i + delta;
- }
- goto patch_insn_buf;
- } else {
- continue;
- }
-
- if (type == BPF_WRITE &&
- env->insn_aux_data[i + delta].nospec_result) {
- /* nospec_result is only used to mitigate Spectre v4 and
- * to limit verification-time for Spectre v1.
- */
- struct bpf_insn *patch = insn_buf;
-
- *patch++ = *insn;
- *patch++ = BPF_ST_NOSPEC();
- cnt = patch - insn_buf;
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- continue;
- }
-
- switch ((int)env->insn_aux_data[i + delta].ptr_type) {
- case PTR_TO_CTX:
- if (!ops->convert_ctx_access)
- continue;
- convert_ctx_access = ops->convert_ctx_access;
- break;
- case PTR_TO_SOCKET:
- case PTR_TO_SOCK_COMMON:
- convert_ctx_access = bpf_sock_convert_ctx_access;
- break;
- case PTR_TO_TCP_SOCK:
- convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
- break;
- case PTR_TO_XDP_SOCK:
- convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
- break;
- case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID | PTR_UNTRUSTED:
- /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
- * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
- * be said once it is marked PTR_UNTRUSTED, hence we must handle
- * any faults for loads into such types. BPF_WRITE is disallowed
- * for this case.
- */
- case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
- case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED:
- if (type == BPF_READ) {
- if (BPF_MODE(insn->code) == BPF_MEM)
- insn->code = BPF_LDX | BPF_PROBE_MEM |
- BPF_SIZE((insn)->code);
- else
- insn->code = BPF_LDX | BPF_PROBE_MEMSX |
- BPF_SIZE((insn)->code);
- env->prog->aux->num_exentries++;
- }
- continue;
- case PTR_TO_ARENA:
- if (BPF_MODE(insn->code) == BPF_MEMSX) {
- if (!bpf_jit_supports_insn(insn, true)) {
- verbose(env, "sign extending loads from arena are not supported yet\n");
- return -EOPNOTSUPP;
- }
- insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code);
- } else {
- insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
- }
- env->prog->aux->num_exentries++;
- continue;
- default:
- continue;
- }
-
- ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
- size = BPF_LDST_BYTES(insn);
- mode = BPF_MODE(insn->code);
-
- /* If the read access is a narrower load of the field,
- * convert to a 4/8-byte load, to minimum program type specific
- * convert_ctx_access changes. If conversion is successful,
- * we will apply proper mask to the result.
- */
- is_narrower_load = size < ctx_field_size;
- size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
- off = insn->off;
- if (is_narrower_load) {
- u8 size_code;
-
- if (type == BPF_WRITE) {
- verifier_bug(env, "narrow ctx access misconfigured");
- return -EFAULT;
- }
-
- size_code = BPF_H;
- if (ctx_field_size == 4)
- size_code = BPF_W;
- else if (ctx_field_size == 8)
- size_code = BPF_DW;
-
- insn->off = off & ~(size_default - 1);
- insn->code = BPF_LDX | BPF_MEM | size_code;
- }
-
- target_size = 0;
- cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
- &target_size);
- if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
- (ctx_field_size && !target_size)) {
- verifier_bug(env, "error during ctx access conversion (%d)", cnt);
- return -EFAULT;
- }
-
- if (is_narrower_load && size < target_size) {
- u8 shift = bpf_ctx_narrow_access_offset(
- off, size, size_default) * 8;
- if (shift && cnt + 1 >= INSN_BUF_SIZE) {
- verifier_bug(env, "narrow ctx load misconfigured");
- return -EFAULT;
- }
- if (ctx_field_size <= 4) {
- if (shift)
- insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
- insn->dst_reg,
- shift);
- insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
- (1 << size * 8) - 1);
- } else {
- if (shift)
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
- insn->dst_reg,
- shift);
- insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
- (1ULL << size * 8) - 1);
- }
- }
- if (mode == BPF_MEMSX)
- insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X,
- insn->dst_reg, insn->dst_reg,
- size * 8, 0);
-
-patch_insn_buf:
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
-
- /* keep walking new program and skip insns we just inserted */
- env->prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- }
-
- return 0;
-}
-
-static int jit_subprogs(struct bpf_verifier_env *env)
-{
- struct bpf_prog *prog = env->prog, **func, *tmp;
- int i, j, subprog_start, subprog_end = 0, len, subprog;
- struct bpf_map *map_ptr;
- struct bpf_insn *insn;
- void *old_bpf_func;
- int err, num_exentries;
- int old_len, subprog_start_adjustment = 0;
-
- if (env->subprog_cnt <= 1)
- return 0;
-
- for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
- continue;
-
- /* Upon error here we cannot fall back to interpreter but
- * need a hard reject of the program. Thus -EFAULT is
- * propagated in any case.
- */
- subprog = find_subprog(env, i + insn->imm + 1);
- if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d",
- i + insn->imm + 1))
- return -EFAULT;
- /* temporarily remember subprog id inside insn instead of
- * aux_data, since next loop will split up all insns into funcs
- */
- insn->off = subprog;
- /* remember original imm in case JIT fails and fallback
- * to interpreter will be needed
- */
- env->insn_aux_data[i].call_imm = insn->imm;
- /* point imm to __bpf_call_base+1 from JITs point of view */
- insn->imm = 1;
- if (bpf_pseudo_func(insn)) {
-#if defined(MODULES_VADDR)
- u64 addr = MODULES_VADDR;
-#else
- u64 addr = VMALLOC_START;
-#endif
- /* jit (e.g. x86_64) may emit fewer instructions
- * if it learns a u32 imm is the same as a u64 imm.
- * Set close enough to possible prog address.
- */
- insn[0].imm = (u32)addr;
- insn[1].imm = addr >> 32;
- }
- }
-
- err = bpf_prog_alloc_jited_linfo(prog);
- if (err)
- goto out_undo_insn;
-
- err = -ENOMEM;
- func = kzalloc_objs(prog, env->subprog_cnt);
- if (!func)
- goto out_undo_insn;
-
- for (i = 0; i < env->subprog_cnt; i++) {
- subprog_start = subprog_end;
- subprog_end = env->subprog_info[i + 1].start;
-
- len = subprog_end - subprog_start;
- /* bpf_prog_run() doesn't call subprogs directly,
- * hence main prog stats include the runtime of subprogs.
- * subprogs don't have IDs and not reachable via prog_get_next_id
- * func[i]->stats will never be accessed and stays NULL
- */
- func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
- if (!func[i])
- goto out_free;
- memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
- len * sizeof(struct bpf_insn));
- func[i]->type = prog->type;
- func[i]->len = len;
- if (bpf_prog_calc_tag(func[i]))
- goto out_free;
- func[i]->is_func = 1;
- func[i]->sleepable = prog->sleepable;
- func[i]->aux->func_idx = i;
- /* Below members will be freed only at prog->aux */
- func[i]->aux->btf = prog->aux->btf;
- func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment;
- func[i]->aux->func_info = prog->aux->func_info;
- func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
- func[i]->aux->poke_tab = prog->aux->poke_tab;
- func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
- func[i]->aux->main_prog_aux = prog->aux;
-
- for (j = 0; j < prog->aux->size_poke_tab; j++) {
- struct bpf_jit_poke_descriptor *poke;
-
- poke = &prog->aux->poke_tab[j];
- if (poke->insn_idx < subprog_end &&
- poke->insn_idx >= subprog_start)
- poke->aux = func[i]->aux;
- }
-
- func[i]->aux->name[0] = 'F';
- func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
- if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
- func[i]->aux->jits_use_priv_stack = true;
-
- func[i]->jit_requested = 1;
- func[i]->blinding_requested = prog->blinding_requested;
- func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
- func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
- func[i]->aux->linfo = prog->aux->linfo;
- func[i]->aux->nr_linfo = prog->aux->nr_linfo;
- func[i]->aux->jited_linfo = prog->aux->jited_linfo;
- func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
- func[i]->aux->arena = prog->aux->arena;
- func[i]->aux->used_maps = env->used_maps;
- func[i]->aux->used_map_cnt = env->used_map_cnt;
- num_exentries = 0;
- insn = func[i]->insnsi;
- for (j = 0; j < func[i]->len; j++, insn++) {
- if (BPF_CLASS(insn->code) == BPF_LDX &&
- (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
- BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
- BPF_MODE(insn->code) == BPF_PROBE_MEM32SX ||
- BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
- num_exentries++;
- if ((BPF_CLASS(insn->code) == BPF_STX ||
- BPF_CLASS(insn->code) == BPF_ST) &&
- BPF_MODE(insn->code) == BPF_PROBE_MEM32)
- num_exentries++;
- if (BPF_CLASS(insn->code) == BPF_STX &&
- BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
- num_exentries++;
- }
- func[i]->aux->num_exentries = num_exentries;
- func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
- func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
- func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
- func[i]->aux->might_sleep = env->subprog_info[i].might_sleep;
- if (!i)
- func[i]->aux->exception_boundary = env->seen_exception;
-
- /*
- * To properly pass the absolute subprog start to jit
- * all instruction adjustments should be accumulated
- */
- old_len = func[i]->len;
- func[i] = bpf_int_jit_compile(func[i]);
- subprog_start_adjustment += func[i]->len - old_len;
-
- if (!func[i]->jited) {
- err = -ENOTSUPP;
- goto out_free;
- }
- cond_resched();
- }
-
- /* at this point all bpf functions were successfully JITed
- * now populate all bpf_calls with correct addresses and
- * run last pass of JIT
- */
- for (i = 0; i < env->subprog_cnt; i++) {
- insn = func[i]->insnsi;
- for (j = 0; j < func[i]->len; j++, insn++) {
- if (bpf_pseudo_func(insn)) {
- subprog = insn->off;
- insn[0].imm = (u32)(long)func[subprog]->bpf_func;
- insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
- continue;
- }
- if (!bpf_pseudo_call(insn))
- continue;
- subprog = insn->off;
- insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func);
- }
-
- /* we use the aux data to keep a list of the start addresses
- * of the JITed images for each function in the program
- *
- * for some architectures, such as powerpc64, the imm field
- * might not be large enough to hold the offset of the start
- * address of the callee's JITed image from __bpf_call_base
- *
- * in such cases, we can lookup the start address of a callee
- * by using its subprog id, available from the off field of
- * the call instruction, as an index for this list
- */
- func[i]->aux->func = func;
- func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
- func[i]->aux->real_func_cnt = env->subprog_cnt;
- }
- for (i = 0; i < env->subprog_cnt; i++) {
- old_bpf_func = func[i]->bpf_func;
- tmp = bpf_int_jit_compile(func[i]);
- if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
- verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
- err = -ENOTSUPP;
- goto out_free;
- }
- cond_resched();
- }
-
- /*
- * Cleanup func[i]->aux fields which aren't required
- * or can become invalid in future
- */
- for (i = 0; i < env->subprog_cnt; i++) {
- func[i]->aux->used_maps = NULL;
- func[i]->aux->used_map_cnt = 0;
- }
-
- /* finally lock prog and jit images for all functions and
- * populate kallsysm. Begin at the first subprogram, since
- * bpf_prog_load will add the kallsyms for the main program.
- */
- for (i = 1; i < env->subprog_cnt; i++) {
- err = bpf_prog_lock_ro(func[i]);
- if (err)
- goto out_free;
- }
-
- for (i = 1; i < env->subprog_cnt; i++)
- bpf_prog_kallsyms_add(func[i]);
-
- /* Last step: make now unused interpreter insns from main
- * prog consistent for later dump requests, so they can
- * later look the same as if they were interpreted only.
- */
- for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (bpf_pseudo_func(insn)) {
- insn[0].imm = env->insn_aux_data[i].call_imm;
- insn[1].imm = insn->off;
- insn->off = 0;
- continue;
- }
- if (!bpf_pseudo_call(insn))
- continue;
- insn->off = env->insn_aux_data[i].call_imm;
- subprog = find_subprog(env, i + insn->off + 1);
- insn->imm = subprog;
- }
-
- prog->jited = 1;
- prog->bpf_func = func[0]->bpf_func;
- prog->jited_len = func[0]->jited_len;
- prog->aux->extable = func[0]->aux->extable;
- prog->aux->num_exentries = func[0]->aux->num_exentries;
- prog->aux->func = func;
- prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
- prog->aux->real_func_cnt = env->subprog_cnt;
- prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
- prog->aux->exception_boundary = func[0]->aux->exception_boundary;
- bpf_prog_jit_attempt_done(prog);
- return 0;
-out_free:
- /* We failed JIT'ing, so at this point we need to unregister poke
- * descriptors from subprogs, so that kernel is not attempting to
- * patch it anymore as we're freeing the subprog JIT memory.
- */
- for (i = 0; i < prog->aux->size_poke_tab; i++) {
- map_ptr = prog->aux->poke_tab[i].tail_call.map;
- map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
- }
- /* At this point we're guaranteed that poke descriptors are not
- * live anymore. We can just unlink its descriptor table as it's
- * released with the main prog.
- */
- for (i = 0; i < env->subprog_cnt; i++) {
- if (!func[i])
- continue;
- func[i]->aux->poke_tab = NULL;
- bpf_jit_free(func[i]);
- }
- kfree(func);
-out_undo_insn:
- /* cleanup main prog to be interpreted */
- prog->jit_requested = 0;
- prog->blinding_requested = 0;
- for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (!bpf_pseudo_call(insn))
- continue;
- insn->off = 0;
- insn->imm = env->insn_aux_data[i].call_imm;
- }
- bpf_prog_jit_attempt_done(prog);
- return err;
-}
-
-static int fixup_call_args(struct bpf_verifier_env *env)
-{
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
- struct bpf_prog *prog = env->prog;
- struct bpf_insn *insn = prog->insnsi;
- bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
- int i, depth;
-#endif
- int err = 0;
-
- if (env->prog->jit_requested &&
- !bpf_prog_is_offloaded(env->prog->aux)) {
- err = jit_subprogs(env);
- if (err == 0)
- return 0;
- if (err == -EFAULT)
- return err;
- }
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
- if (has_kfunc_call) {
- verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
- return -EINVAL;
- }
- if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
- /* When JIT fails the progs with bpf2bpf calls and tail_calls
- * have to be rejected, since interpreter doesn't support them yet.
- */
- verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
- return -EINVAL;
- }
- for (i = 0; i < prog->len; i++, insn++) {
- if (bpf_pseudo_func(insn)) {
- /* When JIT fails the progs with callback calls
- * have to be rejected, since interpreter doesn't support them yet.
- */
- verbose(env, "callbacks are not allowed in non-JITed programs\n");
- return -EINVAL;
- }
-
- if (!bpf_pseudo_call(insn))
- continue;
- depth = get_callee_stack_depth(env, insn, i);
- if (depth < 0)
- return depth;
- bpf_patch_call_args(insn, depth);
- }
- err = 0;
-#endif
- return err;
-}
-
-/* replace a generic kfunc with a specialized version if necessary */
-static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx)
-{
- struct bpf_prog *prog = env->prog;
- bool seen_direct_write;
- void *xdp_kfunc;
- bool is_rdonly;
- u32 func_id = desc->func_id;
- u16 offset = desc->offset;
- unsigned long addr = desc->addr;
-
- if (offset) /* return if module BTF is used */
- return 0;
-
- if (bpf_dev_bound_kfunc_id(func_id)) {
- xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
- if (xdp_kfunc)
- addr = (unsigned long)xdp_kfunc;
- /* fallback to default kfunc when not supported by netdev */
- } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
- seen_direct_write = env->seen_direct_write;
- is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
-
- if (is_rdonly)
- addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
-
- /* restore env->seen_direct_write to its original value, since
- * may_access_direct_pkt_data mutates it
- */
- env->seen_direct_write = seen_direct_write;
- } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) {
- if (bpf_lsm_has_d_inode_locked(prog))
- addr = (unsigned long)bpf_set_dentry_xattr_locked;
- } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) {
- if (bpf_lsm_has_d_inode_locked(prog))
- addr = (unsigned long)bpf_remove_dentry_xattr_locked;
- } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
- if (!env->insn_aux_data[insn_idx].non_sleepable)
- addr = (unsigned long)bpf_dynptr_from_file_sleepable;
- } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) {
- if (env->insn_aux_data[insn_idx].non_sleepable)
- addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable;
- } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) {
- if (env->insn_aux_data[insn_idx].non_sleepable)
- addr = (unsigned long)bpf_arena_free_pages_non_sleepable;
- }
- desc->addr = addr;
- return 0;
-}
-
-static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
- u16 struct_meta_reg,
- u16 node_offset_reg,
- struct bpf_insn *insn,
- struct bpf_insn *insn_buf,
- int *cnt)
-{
- struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
- struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
-
- insn_buf[0] = addr[0];
- insn_buf[1] = addr[1];
- insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
- insn_buf[3] = *insn;
- *cnt = 4;
-}
-
-static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- struct bpf_insn *insn_buf, int insn_idx, int *cnt)
-{
- struct bpf_kfunc_desc *desc;
- int err;
-
- if (!insn->imm) {
- verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
- return -EINVAL;
- }
-
- *cnt = 0;
-
- /* insn->imm has the btf func_id. Replace it with an offset relative to
- * __bpf_call_base, unless the JIT needs to call functions that are
- * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
- */
- desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
- if (!desc) {
- verifier_bug(env, "kernel function descriptor not found for func_id %u",
- insn->imm);
- return -EFAULT;
- }
-
- err = specialize_kfunc(env, desc, insn_idx);
- if (err)
- return err;
-
- if (!bpf_jit_supports_far_kfunc_call())
- insn->imm = BPF_CALL_IMM(desc->addr);
-
- if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
- desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
- struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
- struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
- u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
-
- if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) {
- verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
- insn_idx);
- return -EFAULT;
- }
-
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
- insn_buf[1] = addr[0];
- insn_buf[2] = addr[1];
- insn_buf[3] = *insn;
- *cnt = 4;
- } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
- desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] ||
- desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
- struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
- struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
-
- if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) {
- verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
- insn_idx);
- return -EFAULT;
- }
-
- if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
- !kptr_struct_meta) {
- verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
- insn_idx);
- return -EFAULT;
- }
-
- insn_buf[0] = addr[0];
- insn_buf[1] = addr[1];
- insn_buf[2] = *insn;
- *cnt = 3;
- } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
- desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
- desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
- struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
- int struct_meta_reg = BPF_REG_3;
- int node_offset_reg = BPF_REG_4;
-
- /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
- if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
- struct_meta_reg = BPF_REG_4;
- node_offset_reg = BPF_REG_5;
- }
-
- if (!kptr_struct_meta) {
- verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
- insn_idx);
- return -EFAULT;
- }
-
- __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
- node_offset_reg, insn, insn_buf, cnt);
- } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
- desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
- insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
- *cnt = 1;
- } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] &&
- env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
- /*
- * inline the bpf_session_is_return() for fsession:
- * bool bpf_session_is_return(void *ctx)
- * {
- * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1;
- * }
- */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT);
- insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1);
- *cnt = 3;
- } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] &&
- env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
- /*
- * inline bpf_session_cookie() for fsession:
- * __u64 *bpf_session_cookie(void *ctx)
- * {
- * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF;
- * return &((u64 *)ctx)[-off];
- * }
- */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT);
- insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
- insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
- insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1);
- insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0);
- *cnt = 6;
- }
-
- if (env->insn_aux_data[insn_idx].arg_prog) {
- u32 regno = env->insn_aux_data[insn_idx].arg_prog;
- struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) };
- int idx = *cnt;
-
- insn_buf[idx++] = ld_addrs[0];
- insn_buf[idx++] = ld_addrs[1];
- insn_buf[idx++] = *insn;
- *cnt = idx;
- }
- return 0;
-}
-
-/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
-static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
-{
- struct bpf_subprog_info *info = env->subprog_info;
- int cnt = env->subprog_cnt;
- struct bpf_prog *prog;
-
- /* We only reserve one slot for hidden subprogs in subprog_info. */
- if (env->hidden_subprog_cnt) {
- verifier_bug(env, "only one hidden subprog supported");
- return -EFAULT;
- }
- /* We're not patching any existing instruction, just appending the new
- * ones for the hidden subprog. Hence all of the adjustment operations
- * in bpf_patch_insn_data are no-ops.
- */
- prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
- if (!prog)
- return -ENOMEM;
- env->prog = prog;
- info[cnt + 1].start = info[cnt].start;
- info[cnt].start = prog->len - len + 1;
- env->subprog_cnt++;
- env->hidden_subprog_cnt++;
- return 0;
-}
-
-/* Do various post-verification rewrites in a single program pass.
- * These rewrites simplify JIT and interpreter implementations.
- */
-static int do_misc_fixups(struct bpf_verifier_env *env)
-{
- struct bpf_prog *prog = env->prog;
- enum bpf_attach_type eatype = prog->expected_attach_type;
- enum bpf_prog_type prog_type = resolve_prog_type(prog);
- struct bpf_insn *insn = prog->insnsi;
- const struct bpf_func_proto *fn;
- const int insn_cnt = prog->len;
- const struct bpf_map_ops *ops;
- struct bpf_insn_aux_data *aux;
- struct bpf_insn *insn_buf = env->insn_buf;
- struct bpf_prog *new_prog;
- struct bpf_map *map_ptr;
- int i, ret, cnt, delta = 0, cur_subprog = 0;
- struct bpf_subprog_info *subprogs = env->subprog_info;
- u16 stack_depth = subprogs[cur_subprog].stack_depth;
- u16 stack_depth_extra = 0;
-
- if (env->seen_exception && !env->exception_callback_subprog) {
- struct bpf_insn *patch = insn_buf;
-
- *patch++ = env->prog->insnsi[insn_cnt - 1];
- *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
- *patch++ = BPF_EXIT_INSN();
- ret = add_hidden_subprog(env, insn_buf, patch - insn_buf);
- if (ret < 0)
- return ret;
- prog = env->prog;
- insn = prog->insnsi;
-
- env->exception_callback_subprog = env->subprog_cnt - 1;
- /* Don't update insn_cnt, as add_hidden_subprog always appends insns */
- mark_subprog_exc_cb(env, env->exception_callback_subprog);
- }
-
- for (i = 0; i < insn_cnt;) {
- if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
- if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
- (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
- /* convert to 32-bit mov that clears upper 32-bit */
- insn->code = BPF_ALU | BPF_MOV | BPF_X;
- /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */
- insn->off = 0;
- insn->imm = 0;
- } /* cast from as(0) to as(1) should be handled by JIT */
- goto next_insn;
- }
-
- if (env->insn_aux_data[i + delta].needs_zext)
- /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
- insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
-
- /* Make sdiv/smod divide-by-minus-one exceptions impossible. */
- if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) ||
- insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) ||
- insn->code == (BPF_ALU | BPF_MOD | BPF_K) ||
- insn->code == (BPF_ALU | BPF_DIV | BPF_K)) &&
- insn->off == 1 && insn->imm == -1) {
- bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
- bool isdiv = BPF_OP(insn->code) == BPF_DIV;
- struct bpf_insn *patch = insn_buf;
-
- if (isdiv)
- *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_NEG | BPF_K, insn->dst_reg,
- 0, 0, 0);
- else
- *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
-
- cnt = patch - insn_buf;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */
- if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
- insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
- insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
- insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
- bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
- bool isdiv = BPF_OP(insn->code) == BPF_DIV;
- bool is_sdiv = isdiv && insn->off == 1;
- bool is_smod = !isdiv && insn->off == 1;
- struct bpf_insn *patch = insn_buf;
-
- if (is_sdiv) {
- /* [R,W]x sdiv 0 -> 0
- * LLONG_MIN sdiv -1 -> LLONG_MIN
- * INT_MIN sdiv -1 -> INT_MIN
- */
- *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_ADD | BPF_K, BPF_REG_AX,
- 0, 0, 1);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JGT | BPF_K, BPF_REG_AX,
- 0, 4, 1);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JEQ | BPF_K, BPF_REG_AX,
- 0, 1, 0);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_MOV | BPF_K, insn->dst_reg,
- 0, 0, 0);
- /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */
- *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_NEG | BPF_K, insn->dst_reg,
- 0, 0, 0);
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = *insn;
- cnt = patch - insn_buf;
- } else if (is_smod) {
- /* [R,W]x mod 0 -> [R,W]x */
- /* [R,W]x mod -1 -> 0 */
- *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_ADD | BPF_K, BPF_REG_AX,
- 0, 0, 1);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JGT | BPF_K, BPF_REG_AX,
- 0, 3, 1);
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JEQ | BPF_K, BPF_REG_AX,
- 0, 3 + (is64 ? 0 : 1), 1);
- *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = *insn;
-
- if (!is64) {
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
- }
- cnt = patch - insn_buf;
- } else if (isdiv) {
- /* [R,W]x div 0 -> 0 */
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JNE | BPF_K, insn->src_reg,
- 0, 2, 0);
- *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg);
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = *insn;
- cnt = patch - insn_buf;
- } else {
- /* [R,W]x mod 0 -> [R,W]x */
- *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JEQ | BPF_K, insn->src_reg,
- 0, 1 + (is64 ? 0 : 1), 0);
- *patch++ = *insn;
-
- if (!is64) {
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
- }
- cnt = patch - insn_buf;
- }
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Make it impossible to de-reference a userspace address */
- if (BPF_CLASS(insn->code) == BPF_LDX &&
- (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
- BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
- struct bpf_insn *patch = insn_buf;
- u64 uaddress_limit = bpf_arch_uaddress_limit();
-
- if (!uaddress_limit)
- goto next_insn;
-
- *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
- if (insn->off)
- *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
- *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
- *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
- *patch++ = *insn;
- *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
-
- cnt = patch - insn_buf;
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
- if (BPF_CLASS(insn->code) == BPF_LD &&
- (BPF_MODE(insn->code) == BPF_ABS ||
- BPF_MODE(insn->code) == BPF_IND)) {
- cnt = env->ops->gen_ld_abs(insn, insn_buf);
- if (cnt == 0 || cnt >= INSN_BUF_SIZE) {
- verifier_bug(env, "%d insns generated for ld_abs", cnt);
- return -EFAULT;
- }
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Rewrite pointer arithmetic to mitigate speculation attacks. */
- if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
- insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
- const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
- const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
- struct bpf_insn *patch = insn_buf;
- bool issrc, isneg, isimm;
- u32 off_reg;
-
- aux = &env->insn_aux_data[i + delta];
- if (!aux->alu_state ||
- aux->alu_state == BPF_ALU_NON_POINTER)
- goto next_insn;
-
- isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
- issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
- BPF_ALU_SANITIZE_SRC;
- isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
-
- off_reg = issrc ? insn->src_reg : insn->dst_reg;
- if (isimm) {
- *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
- } else {
- if (isneg)
- *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
- *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
- *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
- *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
- *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
- *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
- *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
- }
- if (!issrc)
- *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
- insn->src_reg = BPF_REG_AX;
- if (isneg)
- insn->code = insn->code == code_add ?
- code_sub : code_add;
- *patch++ = *insn;
- if (issrc && isneg && !isimm)
- *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
- cnt = patch - insn_buf;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
- int stack_off_cnt = -stack_depth - 16;
-
- /*
- * Two 8 byte slots, depth-16 stores the count, and
- * depth-8 stores the start timestamp of the loop.
- *
- * The starting value of count is BPF_MAX_TIMED_LOOPS
- * (0xffff). Every iteration loads it and subs it by 1,
- * until the value becomes 0 in AX (thus, 1 in stack),
- * after which we call arch_bpf_timed_may_goto, which
- * either sets AX to 0xffff to keep looping, or to 0
- * upon timeout. AX is then stored into the stack. In
- * the next iteration, we either see 0 and break out, or
- * continue iterating until the next time value is 0
- * after subtraction, rinse and repeat.
- */
- stack_depth_extra = 16;
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
- if (insn->off >= 0)
- insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
- else
- insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
- insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
- insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
- /*
- * AX is used as an argument to pass in stack_off_cnt
- * (to add to r10/fp), and also as the return value of
- * the call to arch_bpf_timed_may_goto.
- */
- insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
- insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
- insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
- cnt = 7;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- } else if (is_may_goto_insn(insn)) {
- int stack_off = -stack_depth - 8;
-
- stack_depth_extra = 8;
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
- if (insn->off >= 0)
- insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
- else
- insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
- insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
- insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
- cnt = 4;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- if (insn->code != (BPF_JMP | BPF_CALL))
- goto next_insn;
- if (insn->src_reg == BPF_PSEUDO_CALL)
- goto next_insn;
- if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
- if (ret)
- return ret;
- if (cnt == 0)
- goto next_insn;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Skip inlining the helper call if the JIT does it. */
- if (bpf_jit_inlines_helper_call(insn->imm))
- goto next_insn;
-
- if (insn->imm == BPF_FUNC_get_route_realm)
- prog->dst_needed = 1;
- if (insn->imm == BPF_FUNC_get_prandom_u32)
- bpf_user_rnd_init_once();
- if (insn->imm == BPF_FUNC_override_return)
- prog->kprobe_override = 1;
- if (insn->imm == BPF_FUNC_tail_call) {
- /* If we tail call into other programs, we
- * cannot make any assumptions since they can
- * be replaced dynamically during runtime in
- * the program array.
- */
- prog->cb_access = 1;
- if (!allow_tail_call_in_subprogs(env))
- prog->aux->stack_depth = MAX_BPF_STACK;
- prog->aux->max_pkt_offset = MAX_PACKET_OFF;
-
- /* mark bpf_tail_call as different opcode to avoid
- * conditional branch in the interpreter for every normal
- * call and to prevent accidental JITing by JIT compiler
- * that doesn't support bpf_tail_call yet
- */
- insn->imm = 0;
- insn->code = BPF_JMP | BPF_TAIL_CALL;
-
- aux = &env->insn_aux_data[i + delta];
- if (env->bpf_capable && !prog->blinding_requested &&
- prog->jit_requested &&
- !bpf_map_key_poisoned(aux) &&
- !bpf_map_ptr_poisoned(aux) &&
- !bpf_map_ptr_unpriv(aux)) {
- struct bpf_jit_poke_descriptor desc = {
- .reason = BPF_POKE_REASON_TAIL_CALL,
- .tail_call.map = aux->map_ptr_state.map_ptr,
- .tail_call.key = bpf_map_key_immediate(aux),
- .insn_idx = i + delta,
- };
-
- ret = bpf_jit_add_poke_descriptor(prog, &desc);
- if (ret < 0) {
- verbose(env, "adding tail call poke descriptor failed\n");
- return ret;
- }
-
- insn->imm = ret + 1;
- goto next_insn;
- }
-
- if (!bpf_map_ptr_unpriv(aux))
- goto next_insn;
-
- /* instead of changing every JIT dealing with tail_call
- * emit two extra insns:
- * if (index >= max_entries) goto out;
- * index &= array->index_mask;
- * to avoid out-of-bounds cpu speculation
- */
- if (bpf_map_ptr_poisoned(aux)) {
- verbose(env, "tail_call abusing map_ptr\n");
- return -EINVAL;
- }
-
- map_ptr = aux->map_ptr_state.map_ptr;
- insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
- map_ptr->max_entries, 2);
- insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
- container_of(map_ptr,
- struct bpf_array,
- map)->index_mask);
- insn_buf[2] = *insn;
- cnt = 3;
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- if (insn->imm == BPF_FUNC_timer_set_callback) {
- /* The verifier will process callback_fn as many times as necessary
- * with different maps and the register states prepared by
- * set_timer_callback_state will be accurate.
- *
- * The following use case is valid:
- * map1 is shared by prog1, prog2, prog3.
- * prog1 calls bpf_timer_init for some map1 elements
- * prog2 calls bpf_timer_set_callback for some map1 elements.
- * Those that were not bpf_timer_init-ed will return -EINVAL.
- * prog3 calls bpf_timer_start for some map1 elements.
- * Those that were not both bpf_timer_init-ed and
- * bpf_timer_set_callback-ed will return -EINVAL.
- */
- struct bpf_insn ld_addrs[2] = {
- BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
- };
-
- insn_buf[0] = ld_addrs[0];
- insn_buf[1] = ld_addrs[1];
- insn_buf[2] = *insn;
- cnt = 3;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto patch_call_imm;
- }
-
- if (is_storage_get_function(insn->imm)) {
- if (env->insn_aux_data[i + delta].non_sleepable)
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
- else
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
- insn_buf[1] = *insn;
- cnt = 2;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto patch_call_imm;
- }
-
- /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
- if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
- /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
- * bpf_mem_alloc() returns a ptr to the percpu data ptr.
- */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
- insn_buf[1] = *insn;
- cnt = 2;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto patch_call_imm;
- }
-
- /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
- * and other inlining handlers are currently limited to 64 bit
- * only.
- */
- if (prog->jit_requested && BITS_PER_LONG == 64 &&
- (insn->imm == BPF_FUNC_map_lookup_elem ||
- insn->imm == BPF_FUNC_map_update_elem ||
- insn->imm == BPF_FUNC_map_delete_elem ||
- insn->imm == BPF_FUNC_map_push_elem ||
- insn->imm == BPF_FUNC_map_pop_elem ||
- insn->imm == BPF_FUNC_map_peek_elem ||
- insn->imm == BPF_FUNC_redirect_map ||
- insn->imm == BPF_FUNC_for_each_map_elem ||
- insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
- aux = &env->insn_aux_data[i + delta];
- if (bpf_map_ptr_poisoned(aux))
- goto patch_call_imm;
-
- map_ptr = aux->map_ptr_state.map_ptr;
- ops = map_ptr->ops;
- if (insn->imm == BPF_FUNC_map_lookup_elem &&
- ops->map_gen_lookup) {
- cnt = ops->map_gen_lookup(map_ptr, insn_buf);
- if (cnt == -EOPNOTSUPP)
- goto patch_map_ops_generic;
- if (cnt <= 0 || cnt >= INSN_BUF_SIZE) {
- verifier_bug(env, "%d insns generated for map lookup", cnt);
- return -EFAULT;
- }
-
- new_prog = bpf_patch_insn_data(env, i + delta,
- insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
- (void *(*)(struct bpf_map *map, void *key))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
- (long (*)(struct bpf_map *map, void *key))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_update_elem,
- (long (*)(struct bpf_map *map, void *key, void *value,
- u64 flags))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_push_elem,
- (long (*)(struct bpf_map *map, void *value,
- u64 flags))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
- (long (*)(struct bpf_map *map, void *value))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
- (long (*)(struct bpf_map *map, void *value))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_redirect,
- (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
- (long (*)(struct bpf_map *map,
- bpf_callback_t callback_fn,
- void *callback_ctx,
- u64 flags))NULL));
- BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
- (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));
-
-patch_map_ops_generic:
- switch (insn->imm) {
- case BPF_FUNC_map_lookup_elem:
- insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
- goto next_insn;
- case BPF_FUNC_map_update_elem:
- insn->imm = BPF_CALL_IMM(ops->map_update_elem);
- goto next_insn;
- case BPF_FUNC_map_delete_elem:
- insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
- goto next_insn;
- case BPF_FUNC_map_push_elem:
- insn->imm = BPF_CALL_IMM(ops->map_push_elem);
- goto next_insn;
- case BPF_FUNC_map_pop_elem:
- insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
- goto next_insn;
- case BPF_FUNC_map_peek_elem:
- insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
- goto next_insn;
- case BPF_FUNC_redirect_map:
- insn->imm = BPF_CALL_IMM(ops->map_redirect);
- goto next_insn;
- case BPF_FUNC_for_each_map_elem:
- insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
- goto next_insn;
- case BPF_FUNC_map_lookup_percpu_elem:
- insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
- goto next_insn;
- }
-
- goto patch_call_imm;
- }
-
- /* Implement bpf_jiffies64 inline. */
- if (prog->jit_requested && BITS_PER_LONG == 64 &&
- insn->imm == BPF_FUNC_jiffies64) {
- struct bpf_insn ld_jiffies_addr[2] = {
- BPF_LD_IMM64(BPF_REG_0,
- (unsigned long)&jiffies),
- };
-
- insn_buf[0] = ld_jiffies_addr[0];
- insn_buf[1] = ld_jiffies_addr[1];
- insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
- BPF_REG_0, 0);
- cnt = 3;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
- cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
-#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
- /* Implement bpf_get_smp_processor_id() inline. */
- if (insn->imm == BPF_FUNC_get_smp_processor_id &&
- verifier_inlines_helper_call(env, insn->imm)) {
- /* BPF_FUNC_get_smp_processor_id inlining is an
- * optimization, so if cpu_number is ever
- * changed in some incompatible and hard to support
- * way, it's fine to back out this inlining logic
- */
-#ifdef CONFIG_SMP
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number);
- insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
- insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
- cnt = 3;
-#else
- insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
- cnt = 1;
-#endif
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. */
- if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) &&
- verifier_inlines_helper_call(env, insn->imm)) {
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&current_task);
- insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
- insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
- cnt = 3;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-#endif
- /* Implement bpf_get_func_arg inline. */
- if (prog_type == BPF_PROG_TYPE_TRACING &&
- insn->imm == BPF_FUNC_get_func_arg) {
- if (eatype == BPF_TRACE_RAW_TP) {
- int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
-
- /* skip 'void *__data' in btf_trace_##name() and save to reg0 */
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
- cnt = 1;
- } else {
- /* Load nr_args from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
- cnt = 2;
- }
- insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
- insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
- insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
- insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
- insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0);
- insn_buf[cnt++] = BPF_JMP_A(1);
- insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement bpf_get_func_ret inline. */
- if (prog_type == BPF_PROG_TYPE_TRACING &&
- insn->imm == BPF_FUNC_get_func_ret) {
- if (eatype == BPF_TRACE_FEXIT ||
- eatype == BPF_TRACE_FSESSION ||
- eatype == BPF_MODIFY_RETURN) {
- /* Load nr_args from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
- insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
- insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
- insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
- insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
- insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
- cnt = 7;
- } else {
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
- cnt = 1;
- }
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement get_func_arg_cnt inline. */
- if (prog_type == BPF_PROG_TYPE_TRACING &&
- insn->imm == BPF_FUNC_get_func_arg_cnt) {
- if (eatype == BPF_TRACE_RAW_TP) {
- int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
-
- /* skip 'void *__data' in btf_trace_##name() and save to reg0 */
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
- cnt = 1;
- } else {
- /* Load nr_args from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
- cnt = 2;
- }
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement bpf_get_func_ip inline. */
- if (prog_type == BPF_PROG_TYPE_TRACING &&
- insn->imm == BPF_FUNC_get_func_ip) {
- /* Load IP address from ctx - 16 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
- if (!new_prog)
- return -ENOMEM;
-
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement bpf_get_branch_snapshot inline. */
- if (IS_ENABLED(CONFIG_PERF_EVENTS) &&
- prog->jit_requested && BITS_PER_LONG == 64 &&
- insn->imm == BPF_FUNC_get_branch_snapshot) {
- /* We are dealing with the following func protos:
- * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
- * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
- */
- const u32 br_entry_size = sizeof(struct perf_branch_entry);
-
- /* struct perf_branch_entry is part of UAPI and is
- * used as an array element, so extremely unlikely to
- * ever grow or shrink
- */
- BUILD_BUG_ON(br_entry_size != 24);
-
- /* if (unlikely(flags)) return -EINVAL */
- insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
-
- /* Transform size (bytes) into number of entries (cnt = size / 24).
- * But to avoid expensive division instruction, we implement
- * divide-by-3 through multiplication, followed by further
- * division by 8 through 3-bit right shift.
- * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
- * p. 227, chapter "Unsigned Division by 3" for details and proofs.
- *
- * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
- */
- insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
- insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
- insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
-
- /* call perf_snapshot_branch_stack implementation */
- insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
- /* if (entry_cnt == 0) return -ENOENT */
- insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
- /* return entry_cnt * sizeof(struct perf_branch_entry) */
- insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
- insn_buf[7] = BPF_JMP_A(3);
- /* return -EINVAL; */
- insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
- insn_buf[9] = BPF_JMP_A(1);
- /* return -ENOENT; */
- insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
- cnt = 11;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-
- /* Implement bpf_kptr_xchg inline */
- if (prog->jit_requested && BITS_PER_LONG == 64 &&
- insn->imm == BPF_FUNC_kptr_xchg &&
- bpf_jit_supports_ptr_xchg()) {
- insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2);
- insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0);
- cnt = 2;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto next_insn;
- }
-patch_call_imm:
- fn = env->ops->get_func_proto(insn->imm, env->prog);
- /* all functions that have prototype and verifier allowed
- * programs to call them, must be real in-kernel functions
- */
- if (!fn->func) {
- verifier_bug(env,
- "not inlined functions %s#%d is missing func",
- func_id_name(insn->imm), insn->imm);
- return -EFAULT;
- }
- insn->imm = fn->func - __bpf_call_base;
-next_insn:
- if (subprogs[cur_subprog + 1].start == i + delta + 1) {
- subprogs[cur_subprog].stack_depth += stack_depth_extra;
- subprogs[cur_subprog].stack_extra = stack_depth_extra;
-
- stack_depth = subprogs[cur_subprog].stack_depth;
- if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) {
- verbose(env, "stack size %d(extra %d) is too large\n",
- stack_depth, stack_depth_extra);
- return -EINVAL;
- }
- cur_subprog++;
- stack_depth = subprogs[cur_subprog].stack_depth;
- stack_depth_extra = 0;
- }
- i++;
- insn++;
- }
-
- env->prog->aux->stack_depth = subprogs[0].stack_depth;
- for (i = 0; i < env->subprog_cnt; i++) {
- int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
- int subprog_start = subprogs[i].start;
- int stack_slots = subprogs[i].stack_extra / 8;
- int slots = delta, cnt = 0;
-
- if (!stack_slots)
- continue;
- /* We need two slots in case timed may_goto is supported. */
- if (stack_slots > slots) {
- verifier_bug(env, "stack_slots supports may_goto only");
- return -EFAULT;
- }
-
- stack_depth = subprogs[i].stack_depth;
- if (bpf_jit_supports_timed_may_goto()) {
- insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
- BPF_MAX_TIMED_LOOPS);
- insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
- } else {
- /* Add ST insn to subprog prologue to init extra stack */
- insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
- BPF_MAX_LOOPS);
- }
- /* Copy first actual insn to preserve it */
- insn_buf[cnt++] = env->prog->insnsi[subprog_start];
-
- new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
- env->prog = prog = new_prog;
- /*
- * If may_goto is a first insn of a prog there could be a jmp
- * insn that points to it, hence adjust all such jmps to point
- * to insn after BPF_ST that inits may_goto count.
- * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
- */
- WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
- }
-
- /* Since poke tab is now finalized, publish aux to tracker. */
- for (i = 0; i < prog->aux->size_poke_tab; i++) {
- map_ptr = prog->aux->poke_tab[i].tail_call.map;
- if (!map_ptr->ops->map_poke_track ||
- !map_ptr->ops->map_poke_untrack ||
- !map_ptr->ops->map_poke_run) {
- verifier_bug(env, "poke tab is misconfigured");
- return -EFAULT;
- }
-
- ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
- if (ret < 0) {
- verbose(env, "tracking tail call prog failed\n");
- return ret;
- }
- }
-
- ret = sort_kfunc_descs_by_imm_off(env);
- if (ret)
- return ret;
-
- return 0;
-}
-
-static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
- int position,
- s32 stack_base,
- u32 callback_subprogno,
- u32 *total_cnt)
-{
- s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
- s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
- s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
- int reg_loop_max = BPF_REG_6;
- int reg_loop_cnt = BPF_REG_7;
- int reg_loop_ctx = BPF_REG_8;
-
- struct bpf_insn *insn_buf = env->insn_buf;
- struct bpf_prog *new_prog;
- u32 callback_start;
- u32 call_insn_offset;
- s32 callback_offset;
- u32 cnt = 0;
-
- /* This represents an inlined version of bpf_iter.c:bpf_loop,
- * be careful to modify this code in sync.
- */
-
- /* Return error and jump to the end of the patch if
- * expected number of iterations is too big.
- */
- insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
- insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
- insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16);
- /* spill R6, R7, R8 to use these as loop vars */
- insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
- insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
- insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
- /* initialize loop vars */
- insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
- insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0);
- insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
- /* loop header,
- * if reg_loop_cnt >= reg_loop_max skip the loop body
- */
- insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5);
- /* callback call,
- * correct callback offset would be set after patching
- */
- insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
- insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
- insn_buf[cnt++] = BPF_CALL_REL(0);
- /* increment loop counter */
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
- /* jump to loop header if callback returned 0 */
- insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6);
- /* return value of bpf_loop,
- * set R0 to the number of iterations
- */
- insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
- /* restore original values of R6, R7, R8 */
- insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
- insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
- insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
-
- *total_cnt = cnt;
- new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt);
- if (!new_prog)
- return new_prog;
-
- /* callback start is known only after patching */
- callback_start = env->subprog_info[callback_subprogno].start;
- /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
- call_insn_offset = position + 12;
- callback_offset = callback_start - call_insn_offset - 1;
- new_prog->insnsi[call_insn_offset].imm = callback_offset;
-
- return new_prog;
-}
-
-static bool is_bpf_loop_call(struct bpf_insn *insn)
-{
- return insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == 0 &&
- insn->imm == BPF_FUNC_loop;
-}
-
-/* For all sub-programs in the program (including main) check
- * insn_aux_data to see if there are bpf_loop calls that require
- * inlining. If such calls are found the calls are replaced with a
- * sequence of instructions produced by `inline_bpf_loop` function and
- * subprog stack_depth is increased by the size of 3 registers.
- * This stack space is used to spill values of the R6, R7, R8. These
- * registers are used to store the loop bound, counter and context
- * variables.
- */
-static int optimize_bpf_loop(struct bpf_verifier_env *env)
-{
- struct bpf_subprog_info *subprogs = env->subprog_info;
- int i, cur_subprog = 0, cnt, delta = 0;
- struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
- u16 stack_depth = subprogs[cur_subprog].stack_depth;
- u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
- u16 stack_depth_extra = 0;
-
- for (i = 0; i < insn_cnt; i++, insn++) {
- struct bpf_loop_inline_state *inline_state =
- &env->insn_aux_data[i + delta].loop_inline_state;
-
- if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
- struct bpf_prog *new_prog;
-
- stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
- new_prog = inline_bpf_loop(env,
- i + delta,
- -(stack_depth + stack_depth_extra),
- inline_state->callback_subprogno,
- &cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- }
-
- if (subprogs[cur_subprog + 1].start == i + delta + 1) {
- subprogs[cur_subprog].stack_depth += stack_depth_extra;
- cur_subprog++;
- stack_depth = subprogs[cur_subprog].stack_depth;
- stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
- stack_depth_extra = 0;
- }
- }
-
- env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
-
- return 0;
-}
-
-/* Remove unnecessary spill/fill pairs, members of fastcall pattern,
- * adjust subprograms stack depth when possible.
- */
-static int remove_fastcall_spills_fills(struct bpf_verifier_env *env)
-{
- struct bpf_subprog_info *subprog = env->subprog_info;
- struct bpf_insn_aux_data *aux = env->insn_aux_data;
- struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
- u32 spills_num;
- bool modified = false;
- int i, j;
-
- for (i = 0; i < insn_cnt; i++, insn++) {
- if (aux[i].fastcall_spills_num > 0) {
- spills_num = aux[i].fastcall_spills_num;
- /* NOPs would be removed by opt_remove_nops() */
- for (j = 1; j <= spills_num; ++j) {
- *(insn - j) = NOP;
- *(insn + j) = NOP;
- }
- modified = true;
- }
- if ((subprog + 1)->start == i + 1) {
- if (modified && !subprog->keep_fastcall_stack)
- subprog->stack_depth = -subprog->fastcall_stack_off;
- subprog++;
- modified = false;
- }
- }
-
- return 0;
-}
static void free_states(struct bpf_verifier_env *env)
{
@@ -24533,13 +18625,13 @@ static void free_states(struct bpf_verifier_env *env)
struct bpf_scc_info *info;
int i, j;
- free_verifier_state(env->cur_state, true);
+ bpf_free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
while (!pop_stack(env, NULL, NULL, false));
list_for_each_safe(pos, tmp, &env->free_list) {
sl = container_of(pos, struct bpf_verifier_state_list, node);
- free_verifier_state(&sl->state, false);
+ bpf_free_verifier_state(&sl->state, false);
kfree(sl);
}
INIT_LIST_HEAD(&env->free_list);
@@ -24549,7 +18641,7 @@ static void free_states(struct bpf_verifier_env *env)
if (!info)
continue;
for (j = 0; j < info->num_visits; j++)
- free_backedges(&info->visits[j]);
+ bpf_free_backedges(&info->visits[j]);
kvfree(info);
env->scc_info[i] = NULL;
}
@@ -24562,7 +18654,7 @@ static void free_states(struct bpf_verifier_env *env)
list_for_each_safe(pos, tmp, head) {
sl = container_of(pos, struct bpf_verifier_state_list, node);
- free_verifier_state(&sl->state, false);
+ bpf_free_verifier_state(&sl->state, false);
kfree(sl);
}
INIT_LIST_HEAD(&env->explored_states[i]);
@@ -24615,10 +18707,18 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
if (subprog_is_exc_cb(env, subprog)) {
state->frame[0]->in_exception_callback_fn = true;
- /* We have already ensured that the callback returns an integer, just
- * like all global subprogs. We need to determine it only has a single
- * scalar argument.
+
+ /*
+ * Global functions are scalar or void, make sure
+ * we return a scalar.
*/
+ if (subprog_returns_void(env, subprog)) {
+ verbose(env, "exception cb cannot return void\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Also ensure the callback only has a single scalar argument. */
if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) {
verbose(env, "exception cb only supports single integer argument\n");
ret = -EINVAL;
@@ -24735,7 +18835,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
again:
new_cnt = 0;
for (i = 1; i < env->subprog_cnt; i++) {
- if (!subprog_is_global(env, i))
+ if (!bpf_subprog_is_global(env, i))
continue;
sub_aux = subprog_aux(env, i);
@@ -24897,7 +18997,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) {
- if (st_ops_desc->arg_info[member_idx].info->refcounted) {
+ if (st_ops_desc->arg_info[member_idx].info[i].refcounted) {
has_refcounted_arg = true;
break;
}
@@ -24925,14 +19025,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
#define SECURITY_PREFIX "security_"
-static int check_attach_modify_return(unsigned long addr, const char *func_name)
-{
- if (within_error_injection_list(addr) ||
- !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
- return 0;
-
- return -EINVAL;
-}
+#ifdef CONFIG_FUNCTION_ERROR_INJECTION
/* list of non-sleepable functions that are otherwise on
* ALLOW_ERROR_INJECTION list
@@ -24955,6 +19048,75 @@ static int check_non_sleepable_error_inject(u32 btf_id)
return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
}
+static int check_attach_sleepable(u32 btf_id, unsigned long addr, const char *func_name)
+{
+ /* fentry/fexit/fmod_ret progs can be sleepable if they are
+ * attached to ALLOW_ERROR_INJECTION and are not in denylist.
+ */
+ if (!check_non_sleepable_error_inject(btf_id) &&
+ within_error_injection_list(addr))
+ return 0;
+
+ return -EINVAL;
+}
+
+static int check_attach_modify_return(unsigned long addr, const char *func_name)
+{
+ if (within_error_injection_list(addr) ||
+ !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
+ return 0;
+
+ return -EINVAL;
+}
+
+#else
+
+/* Unfortunately, the arch-specific prefixes are hard-coded in arch syscall code
+ * so we need to hard-code them, too. Ftrace has arch_syscall_match_sym_name()
+ * but that just compares two concrete function names.
+ */
+static bool has_arch_syscall_prefix(const char *func_name)
+{
+#if defined(__x86_64__)
+ return !strncmp(func_name, "__x64_", 6);
+#elif defined(__i386__)
+ return !strncmp(func_name, "__ia32_", 7);
+#elif defined(__s390x__)
+ return !strncmp(func_name, "__s390x_", 8);
+#elif defined(__aarch64__)
+ return !strncmp(func_name, "__arm64_", 8);
+#elif defined(__riscv)
+ return !strncmp(func_name, "__riscv_", 8);
+#elif defined(__powerpc__) || defined(__powerpc64__)
+ return !strncmp(func_name, "sys_", 4);
+#elif defined(__loongarch__)
+ return !strncmp(func_name, "sys_", 4);
+#else
+ return false;
+#endif
+}
+
+/* Without error injection, allow sleepable and fmod_ret progs on syscalls. */
+
+static int check_attach_sleepable(u32 btf_id, unsigned long addr, const char *func_name)
+{
+ if (has_arch_syscall_prefix(func_name))
+ return 0;
+
+ return -EINVAL;
+}
+
+static int check_attach_modify_return(unsigned long addr, const char *func_name)
+{
+ if (has_arch_syscall_prefix(func_name) ||
+ !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
+ return 0;
+
+ return -EINVAL;
+}
+
+#endif /* CONFIG_FUNCTION_ERROR_INJECTION */
+
int bpf_check_attach_target(struct bpf_verifier_log *log,
const struct bpf_prog *prog,
const struct bpf_prog *tgt_prog,
@@ -24981,7 +19143,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf;
if (!btf) {
bpf_log(log,
- "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
+ "Tracing program can only be attached to another program annotated with BTF\n");
return -EINVAL;
}
t = btf_type_by_id(btf, btf_id);
@@ -25017,7 +19179,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (aux->func && aux->func[subprog]->aux->exception_cb) {
bpf_log(log,
"%s programs cannot attach to exception callback\n",
- prog_extension ? "Extension" : "FENTRY/FEXIT");
+ prog_extension ? "Extension" : "Tracing");
return -EINVAL;
}
conservative = aux->func_info_aux[subprog].unreliable;
@@ -25106,7 +19268,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
case BPF_TRACE_RAW_TP:
if (tgt_prog) {
bpf_log(log,
- "Only FENTRY/FEXIT progs are attachable to another BPF prog\n");
+ "Only FENTRY/FEXIT/FSESSION progs are attachable to another BPF prog\n");
return -EINVAL;
}
if (!btf_type_is_typedef(t)) {
@@ -25234,12 +19396,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
ret = -EINVAL;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
-
- /* fentry/fexit/fmod_ret progs can be sleepable if they are
- * attached to ALLOW_ERROR_INJECTION and are not in denylist.
- */
- if (!check_non_sleepable_error_inject(btf_id) &&
- within_error_injection_list(addr))
+ if (!check_attach_sleepable(btf_id, addr, tname))
ret = 0;
/* fentry/fexit/fmod_ret progs can also be sleepable if they are
* in the fmodret id set with the KF_SLEEPABLE flag.
@@ -25377,7 +19534,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
}
if (prog->sleepable && !can_be_sleepable(prog)) {
- verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
+ verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
return -EINVAL;
}
@@ -25525,430 +19682,209 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr,
return 0;
}
-/* Each field is a register bitmask */
-struct insn_live_regs {
- u16 use; /* registers read by instruction */
- u16 def; /* registers written by instruction */
- u16 in; /* registers that may be alive before instruction */
- u16 out; /* registers that may be alive after instruction */
-};
+/* replace a generic kfunc with a specialized version if necessary */
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx)
+{
+ struct bpf_prog *prog = env->prog;
+ bool seen_direct_write;
+ void *xdp_kfunc;
+ bool is_rdonly;
+ u32 func_id = desc->func_id;
+ u16 offset = desc->offset;
+ unsigned long addr = desc->addr;
-/* Bitmask with 1s for all caller saved registers */
-#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+ if (offset) /* return if module BTF is used */
+ return 0;
-/* Compute info->{use,def} fields for the instruction */
-static void compute_insn_live_regs(struct bpf_verifier_env *env,
- struct bpf_insn *insn,
- struct insn_live_regs *info)
-{
- struct call_summary cs;
- u8 class = BPF_CLASS(insn->code);
- u8 code = BPF_OP(insn->code);
- u8 mode = BPF_MODE(insn->code);
- u16 src = BIT(insn->src_reg);
- u16 dst = BIT(insn->dst_reg);
- u16 r0 = BIT(0);
- u16 def = 0;
- u16 use = 0xffff;
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
+ if (xdp_kfunc)
+ addr = (unsigned long)xdp_kfunc;
+ /* fallback to default kfunc when not supported by netdev */
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ seen_direct_write = env->seen_direct_write;
+ is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
- switch (class) {
- case BPF_LD:
- switch (mode) {
- case BPF_IMM:
- if (BPF_SIZE(insn->code) == BPF_DW) {
- def = dst;
- use = 0;
- }
- break;
- case BPF_LD | BPF_ABS:
- case BPF_LD | BPF_IND:
- /* stick with defaults */
- break;
- }
- break;
- case BPF_LDX:
- switch (mode) {
- case BPF_MEM:
- case BPF_MEMSX:
- def = dst;
- use = src;
- break;
- }
- break;
- case BPF_ST:
- switch (mode) {
- case BPF_MEM:
- def = 0;
- use = dst;
- break;
- }
- break;
- case BPF_STX:
- switch (mode) {
- case BPF_MEM:
- def = 0;
- use = dst | src;
- break;
- case BPF_ATOMIC:
- switch (insn->imm) {
- case BPF_CMPXCHG:
- use = r0 | dst | src;
- def = r0;
- break;
- case BPF_LOAD_ACQ:
- def = dst;
- use = src;
- break;
- case BPF_STORE_REL:
- def = 0;
- use = dst | src;
- break;
- default:
- use = dst | src;
- if (insn->imm & BPF_FETCH)
- def = src;
- else
- def = 0;
- }
- break;
- }
- break;
- case BPF_ALU:
- case BPF_ALU64:
- switch (code) {
- case BPF_END:
- use = dst;
- def = dst;
- break;
- case BPF_MOV:
- def = dst;
- if (BPF_SRC(insn->code) == BPF_K)
- use = 0;
- else
- use = src;
- break;
- default:
- def = dst;
- if (BPF_SRC(insn->code) == BPF_K)
- use = dst;
- else
- use = dst | src;
- }
- break;
- case BPF_JMP:
- case BPF_JMP32:
- switch (code) {
- case BPF_JA:
- def = 0;
- if (BPF_SRC(insn->code) == BPF_X)
- use = dst;
- else
- use = 0;
- break;
- case BPF_JCOND:
- def = 0;
- use = 0;
- break;
- case BPF_EXIT:
- def = 0;
- use = r0;
- break;
- case BPF_CALL:
- def = ALL_CALLER_SAVED_REGS;
- use = def & ~BIT(BPF_REG_0);
- if (get_call_summary(env, insn, &cs))
- use = GENMASK(cs.num_params, 1);
- break;
- default:
- def = 0;
- if (BPF_SRC(insn->code) == BPF_K)
- use = dst;
- else
- use = dst | src;
- }
- break;
+ if (is_rdonly)
+ addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+
+ /* restore env->seen_direct_write to its original value, since
+ * may_access_direct_pkt_data mutates it
+ */
+ env->seen_direct_write = seen_direct_write;
+ } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_set_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_remove_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+ if (!env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_dynptr_from_file_sleepable;
+ } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) {
+ if (env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable;
+ } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) {
+ if (env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_arena_free_pages_non_sleepable;
}
+ desc->addr = addr;
+ return 0;
+}
+
+static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
+ u16 struct_meta_reg,
+ u16 node_offset_reg,
+ struct bpf_insn *insn,
+ struct bpf_insn *insn_buf,
+ int *cnt)
+{
+ struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
- info->def = def;
- info->use = use;
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
+ insn_buf[3] = *insn;
+ *cnt = 4;
}
-/* Compute may-live registers after each instruction in the program.
- * The register is live after the instruction I if it is read by some
- * instruction S following I during program execution and is not
- * overwritten between I and S.
- *
- * Store result in env->insn_aux_data[i].live_regs.
- */
-static int compute_live_registers(struct bpf_verifier_env *env)
+int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
- struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
- struct bpf_insn *insns = env->prog->insnsi;
- struct insn_live_regs *state;
- int insn_cnt = env->prog->len;
- int err = 0, i, j;
- bool changed;
-
- /* Use the following algorithm:
- * - define the following:
- * - I.use : a set of all registers read by instruction I;
- * - I.def : a set of all registers written by instruction I;
- * - I.in : a set of all registers that may be alive before I execution;
- * - I.out : a set of all registers that may be alive after I execution;
- * - insn_successors(I): a set of instructions S that might immediately
- * follow I for some program execution;
- * - associate separate empty sets 'I.in' and 'I.out' with each instruction;
- * - visit each instruction in a postorder and update
- * state[i].in, state[i].out as follows:
- *
- * state[i].out = U [state[s].in for S in insn_successors(i)]
- * state[i].in = (state[i].out / state[i].def) U state[i].use
- *
- * (where U stands for set union, / stands for set difference)
- * - repeat the computation while {in,out} fields changes for
- * any instruction.
+ struct bpf_kfunc_desc *desc;
+ int err;
+
+ if (!insn->imm) {
+ verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
+ return -EINVAL;
+ }
+
+ *cnt = 0;
+
+ /* insn->imm has the btf func_id. Replace it with an offset relative to
+ * __bpf_call_base, unless the JIT needs to call functions that are
+ * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
*/
- state = kvzalloc_objs(*state, insn_cnt, GFP_KERNEL_ACCOUNT);
- if (!state) {
- err = -ENOMEM;
- goto out;
+ desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
+ if (!desc) {
+ verifier_bug(env, "kernel function descriptor not found for func_id %u",
+ insn->imm);
+ return -EFAULT;
}
- for (i = 0; i < insn_cnt; ++i)
- compute_insn_live_regs(env, &insns[i], &state[i]);
-
- changed = true;
- while (changed) {
- changed = false;
- for (i = 0; i < env->cfg.cur_postorder; ++i) {
- int insn_idx = env->cfg.insn_postorder[i];
- struct insn_live_regs *live = &state[insn_idx];
- struct bpf_iarray *succ;
- u16 new_out = 0;
- u16 new_in = 0;
-
- succ = bpf_insn_successors(env, insn_idx);
- for (int s = 0; s < succ->cnt; ++s)
- new_out |= state[succ->items[s]].in;
- new_in = (new_out & ~live->def) | live->use;
- if (new_out != live->out || new_in != live->in) {
- live->in = new_in;
- live->out = new_out;
- changed = true;
- }
+ err = specialize_kfunc(env, desc, insn_idx);
+ if (err)
+ return err;
+
+ if (!bpf_jit_supports_far_kfunc_call())
+ insn->imm = BPF_CALL_IMM(desc->addr);
+
+ if (is_bpf_obj_new_kfunc(desc->func_id) || is_bpf_percpu_obj_new_kfunc(desc->func_id)) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+
+ if (is_bpf_percpu_obj_new_kfunc(desc->func_id) && kptr_struct_meta) {
+ verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
}
- }
- for (i = 0; i < insn_cnt; ++i)
- insn_aux[i].live_regs_before = state[i].in;
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
+ insn_buf[1] = addr[0];
+ insn_buf[2] = addr[1];
+ insn_buf[3] = *insn;
+ *cnt = 4;
+ } else if (is_bpf_obj_drop_kfunc(desc->func_id) ||
+ is_bpf_percpu_obj_drop_kfunc(desc->func_id) ||
+ is_bpf_refcount_acquire_kfunc(desc->func_id)) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "Live regs before insn:\n");
- for (i = 0; i < insn_cnt; ++i) {
- if (env->insn_aux_data[i].scc)
- verbose(env, "%3d ", env->insn_aux_data[i].scc);
- else
- verbose(env, " ");
- verbose(env, "%3d: ", i);
- for (j = BPF_REG_0; j < BPF_REG_10; ++j)
- if (insn_aux[i].live_regs_before & BIT(j))
- verbose(env, "%d", j);
- else
- verbose(env, ".");
- verbose(env, " ");
- verbose_insn(env, &insns[i]);
- if (bpf_is_ldimm64(&insns[i]))
- i++;
+ if (is_bpf_percpu_obj_drop_kfunc(desc->func_id) && kptr_struct_meta) {
+ verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
}
- }
-out:
- kvfree(state);
- return err;
-}
+ if (is_bpf_refcount_acquire_kfunc(desc->func_id) && !kptr_struct_meta) {
+ verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
-/*
- * Compute strongly connected components (SCCs) on the CFG.
- * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc.
- * If instruction is a sole member of its SCC and there are no self edges,
- * assign it SCC number of zero.
- * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation.
- */
-static int compute_scc(struct bpf_verifier_env *env)
-{
- const u32 NOT_ON_STACK = U32_MAX;
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
+ } else if (is_bpf_list_push_kfunc(desc->func_id) ||
+ is_bpf_rbtree_add_kfunc(desc->func_id)) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ int struct_meta_reg = BPF_REG_3;
+ int node_offset_reg = BPF_REG_4;
- struct bpf_insn_aux_data *aux = env->insn_aux_data;
- const u32 insn_cnt = env->prog->len;
- int stack_sz, dfs_sz, err = 0;
- u32 *stack, *pre, *low, *dfs;
- u32 i, j, t, w;
- u32 next_preorder_num;
- u32 next_scc_id;
- bool assign_scc;
- struct bpf_iarray *succ;
-
- next_preorder_num = 1;
- next_scc_id = 1;
- /*
- * - 'stack' accumulates vertices in DFS order, see invariant comment below;
- * - 'pre[t] == p' => preorder number of vertex 't' is 'p';
- * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n';
- * - 'dfs' DFS traversal stack, used to emulate explicit recursion.
- */
- stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
- pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
- low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
- dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT);
- if (!stack || !pre || !low || !dfs) {
- err = -ENOMEM;
- goto exit;
+ /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
+ if (is_bpf_rbtree_add_kfunc(desc->func_id)) {
+ struct_meta_reg = BPF_REG_4;
+ node_offset_reg = BPF_REG_5;
+ }
+
+ if (!kptr_struct_meta) {
+ verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
+ node_offset_reg, insn, insn_buf, cnt);
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *cnt = 1;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] &&
+ env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ /*
+ * inline the bpf_session_is_return() for fsession:
+ * bool bpf_session_is_return(void *ctx)
+ * {
+ * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1;
+ * }
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1);
+ *cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] &&
+ env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ /*
+ * inline bpf_session_cookie() for fsession:
+ * __u64 *bpf_session_cookie(void *ctx)
+ * {
+ * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF;
+ * return &((u64 *)ctx)[-off];
+ * }
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1);
+ insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0);
+ *cnt = 6;
}
- /*
- * References:
- * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms"
- * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components"
- *
- * The algorithm maintains the following invariant:
- * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]';
- * - then, vertex 'u' remains on stack while vertex 'v' is on stack.
- *
- * Consequently:
- * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u',
- * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack,
- * and thus there is an SCC (loop) containing both 'u' and 'v'.
- * - If 'low[v] == pre[v]', loops containing 'v' have been explored,
- * and 'v' can be considered the root of some SCC.
- *
- * Here is a pseudo-code for an explicitly recursive version of the algorithm:
- *
- * NOT_ON_STACK = insn_cnt + 1
- * pre = [0] * insn_cnt
- * low = [0] * insn_cnt
- * scc = [0] * insn_cnt
- * stack = []
- *
- * next_preorder_num = 1
- * next_scc_id = 1
- *
- * def recur(w):
- * nonlocal next_preorder_num
- * nonlocal next_scc_id
- *
- * pre[w] = next_preorder_num
- * low[w] = next_preorder_num
- * next_preorder_num += 1
- * stack.append(w)
- * for s in successors(w):
- * # Note: for classic algorithm the block below should look as:
- * #
- * # if pre[s] == 0:
- * # recur(s)
- * # low[w] = min(low[w], low[s])
- * # elif low[s] != NOT_ON_STACK:
- * # low[w] = min(low[w], pre[s])
- * #
- * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])'
- * # does not break the invariant and makes itartive version of the algorithm
- * # simpler. See 'Algorithm #3' from [2].
- *
- * # 's' not yet visited
- * if pre[s] == 0:
- * recur(s)
- * # if 's' is on stack, pick lowest reachable preorder number from it;
- * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]',
- * # so 'min' would be a noop.
- * low[w] = min(low[w], low[s])
- *
- * if low[w] == pre[w]:
- * # 'w' is the root of an SCC, pop all vertices
- * # below 'w' on stack and assign same SCC to them.
- * while True:
- * t = stack.pop()
- * low[t] = NOT_ON_STACK
- * scc[t] = next_scc_id
- * if t == w:
- * break
- * next_scc_id += 1
- *
- * for i in range(0, insn_cnt):
- * if pre[i] == 0:
- * recur(i)
- *
- * Below implementation replaces explicit recursion with array 'dfs'.
- */
- for (i = 0; i < insn_cnt; i++) {
- if (pre[i])
- continue;
- stack_sz = 0;
- dfs_sz = 1;
- dfs[0] = i;
-dfs_continue:
- while (dfs_sz) {
- w = dfs[dfs_sz - 1];
- if (pre[w] == 0) {
- low[w] = next_preorder_num;
- pre[w] = next_preorder_num;
- next_preorder_num++;
- stack[stack_sz++] = w;
- }
- /* Visit 'w' successors */
- succ = bpf_insn_successors(env, w);
- for (j = 0; j < succ->cnt; ++j) {
- if (pre[succ->items[j]]) {
- low[w] = min(low[w], low[succ->items[j]]);
- } else {
- dfs[dfs_sz++] = succ->items[j];
- goto dfs_continue;
- }
- }
- /*
- * Preserve the invariant: if some vertex above in the stack
- * is reachable from 'w', keep 'w' on the stack.
- */
- if (low[w] < pre[w]) {
- dfs_sz--;
- goto dfs_continue;
- }
- /*
- * Assign SCC number only if component has two or more elements,
- * or if component has a self reference, or if instruction is a
- * callback calling function (implicit loop).
- */
- assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */
- for (j = 0; j < succ->cnt; ++j) { /* self reference? */
- if (succ->items[j] == w) {
- assign_scc = true;
- break;
- }
- }
- if (bpf_calls_callback(env, w)) /* implicit loop? */
- assign_scc = true;
- /* Pop component elements from stack */
- do {
- t = stack[--stack_sz];
- low[t] = NOT_ON_STACK;
- if (assign_scc)
- aux[t].scc = next_scc_id;
- } while (t != w);
- if (assign_scc)
- next_scc_id++;
- dfs_sz--;
- }
- }
- env->scc_info = kvzalloc_objs(*env->scc_info, next_scc_id,
- GFP_KERNEL_ACCOUNT);
- if (!env->scc_info) {
- err = -ENOMEM;
- goto exit;
- }
- env->scc_cnt = next_scc_id;
-exit:
- kvfree(stack);
- kvfree(pre);
- kvfree(low);
- kvfree(dfs);
- return err;
+
+ if (env->insn_aux_data[insn_idx].arg_prog) {
+ u32 regno = env->insn_aux_data[insn_idx].arg_prog;
+ struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) };
+ int idx = *cnt;
+
+ insn_buf[idx++] = ld_addrs[0];
+ insn_buf[idx++] = ld_addrs[1];
+ insn_buf[idx++] = *insn;
+ *cnt = idx;
+ }
+ return 0;
}
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
@@ -25982,7 +19918,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
goto err_free_env;
for (i = 0; i < len; i++)
env->insn_aux_data[i].orig_idx = i;
- env->succ = iarray_realloc(NULL, 2);
+ env->succ = bpf_iarray_realloc(NULL, 2);
if (!env->succ)
goto err_free_env;
env->prog = *prog;
@@ -26043,7 +19979,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
INIT_LIST_HEAD(&env->explored_states[i]);
INIT_LIST_HEAD(&env->free_list);
- ret = check_btf_info_early(env, attr, uattr);
+ ret = bpf_check_btf_info_early(env, attr, uattr);
if (ret < 0)
goto skip_full_check;
@@ -26055,11 +19991,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
- ret = check_btf_info(env, attr, uattr);
+ ret = bpf_check_btf_info(env, attr, uattr);
if (ret < 0)
goto skip_full_check;
- ret = resolve_pseudo_ldimm64(env);
+ ret = check_and_resolve_insns(env);
if (ret < 0)
goto skip_full_check;
@@ -26069,11 +20005,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
goto skip_full_check;
}
- ret = check_cfg(env);
+ ret = bpf_check_cfg(env);
if (ret < 0)
goto skip_full_check;
- ret = compute_postorder(env);
+ ret = bpf_compute_postorder(env);
if (ret < 0)
goto skip_full_check;
@@ -26085,11 +20021,23 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret)
goto skip_full_check;
- ret = compute_scc(env);
+ ret = bpf_compute_const_regs(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = bpf_prune_dead_branches(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = sort_subprogs_topo(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = bpf_compute_scc(env);
if (ret < 0)
goto skip_full_check;
- ret = compute_live_registers(env);
+ ret = bpf_compute_live_registers(env);
if (ret < 0)
goto skip_full_check;
@@ -26110,22 +20058,22 @@ skip_full_check:
* allocate additional slots.
*/
if (ret == 0)
- ret = remove_fastcall_spills_fills(env);
+ ret = bpf_remove_fastcall_spills_fills(env);
if (ret == 0)
ret = check_max_stack_depth(env);
/* instruction rewrites happen after this point */
if (ret == 0)
- ret = optimize_bpf_loop(env);
+ ret = bpf_optimize_bpf_loop(env);
if (is_priv) {
if (ret == 0)
- opt_hard_wire_dead_code_branches(env);
+ bpf_opt_hard_wire_dead_code_branches(env);
if (ret == 0)
- ret = opt_remove_dead_code(env);
+ ret = bpf_opt_remove_dead_code(env);
if (ret == 0)
- ret = opt_remove_nops(env);
+ ret = bpf_opt_remove_nops(env);
} else {
if (ret == 0)
sanitize_dead_code(env);
@@ -26133,22 +20081,22 @@ skip_full_check:
if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
- ret = convert_ctx_accesses(env);
+ ret = bpf_convert_ctx_accesses(env);
if (ret == 0)
- ret = do_misc_fixups(env);
+ ret = bpf_do_misc_fixups(env);
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
*/
if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
- ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
+ ret = bpf_opt_subreg_zext_lo32_rnd_hi32(env, attr);
env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
: false;
}
if (ret == 0)
- ret = fixup_call_args(env);
+ ret = bpf_fixup_call_args(env);
env->verification_time = ktime_get_ns() - start_time;
print_verification_stats(env);
@@ -26230,7 +20178,7 @@ err_release_maps:
err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
- clear_insn_aux_data(env, 0, env->prog->len);
+ bpf_clear_insn_aux_data(env, 0, env->prog->len);
vfree(env->insn_aux_data);
err_free_env:
bpf_stack_liveness_free(env);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a5dbb72528e0..058724c41c46 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -765,6 +765,14 @@ static unsigned int number_of_same_symbols(const char *mod, const char *func_nam
if (!mod)
kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
+ /*
+ * If the symbol is found in vmlinux, use vmlinux resolution only.
+ * This prevents module symbols from shadowing vmlinux symbols
+ * and causing -EADDRNOTAVAIL for unqualified kprobe targets.
+ */
+ if (!mod && ctx.count > 0)
+ return ctx.count;
+
module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx);
return ctx.count;