From a8f500af0ccffc3d2aaf9018537981cb173865a1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:06 -0700 Subject: bpf: split explored_states split explored_states into prune_point boolean mark and link list of explored states. This removes STATE_LIST_MARK hack and allows marks to be separate from states. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1305ccbd8fe6..02bba09a0ea1 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -233,6 +233,7 @@ struct bpf_insn_aux_data { int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ u8 alu_state; /* used in combination with alu_limit */ + bool prune_point; unsigned int orig_idx; /* original instruction index */ }; -- cgit v1.2.3 From dc2a4ebc0b44a212fcf72242210e56aa17e7317b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:07 -0700 Subject: bpf: convert explored_states to hash table All prune points inside a callee bpf function most likely will have different callsites. For example, if function foo() is called from two callsites the half of explored states in all prune points in foo() will be useless for subsequent walking of one of those callsites. Fortunately explored_states pruning heuristics keeps the number of states per prune point small, but walking these states is still a waste of cpu time when the callsite of the current state is different from the callsite of the explored state. To improve pruning logic convert explored_states into hash table and use simple insn_idx ^ callsite hash to select hash bucket. This optimization has no effect on programs without bpf2bpf calls and drastically improves programs with calls. In the later case it reduces total memory consumption in 1M scale tests by almost 3 times (peak_states drops from 5752 to 2016). Care should be taken when comparing the states for equivalency. Since the same hash bucket can now contain states with different indices the insn_idx has to be part of verifier_state and compared. Different hash table sizes and different hash functions were explored, but the results were not significantly better vs this patch. They can be improved in the future. Hit/miss heuristic is not counting index miscompare as a miss. Otherwise verifier stats become unstable when experimenting with different hash functions. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 02bba09a0ea1..405b502283c5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -187,6 +187,7 @@ struct bpf_func_state { struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; + u32 insn_idx; u32 curframe; u32 active_spin_lock; bool speculative; -- cgit v1.2.3 From 5327ed3d44b754f5cc51d5b3f18e442eaebacff5 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:12 +0100 Subject: bpf: verifier: mark verified-insn with sub-register zext flag eBPF ISA specification requires high 32-bit cleared when low 32-bit sub-register is written. This applies to destination register of ALU32 etc. JIT back-ends must guarantee this semantic when doing code-gen. x86_64 and AArch64 ISA has the same semantics, so the corresponding JIT back-end doesn't need to do extra work. However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches (PowerPC, SPARC etc) need to do explicit zero extension to meet this requirement, otherwise code like the following will fail. u64_value = (u64) u32_value ... other uses of u64_value This is because compiler could exploit the semantic described above and save those zero extensions for extending u32_value to u64_value, these JIT back-ends are expected to guarantee this through inserting extra zero extensions which however could be a significant increase on the code size. Some benchmarks show there could be ~40% sub-register writes out of total insns, meaning at least ~40% extra code-gen. One observation is these extra zero extensions are not always necessary. Take above code snippet for example, it is possible u32_value will never be casted into a u64, the value of high 32-bit of u32_value then could be ignored and extra zero extension could be eliminated. This patch implements this idea, insns defining sub-registers will be marked when the high 32-bit of the defined sub-register matters. For those unmarked insns, it is safe to eliminate high 32-bit clearnace for them. Algo: - Split read flags into READ32 and READ64. - Record index of insn that does sub-register write. Keep the index inside reg state and update it during verifier insn walking. - A full register read on a sub-register marks its definition insn as needing zero extension on dst register. A new sub-register write overrides the old one. - When propagating read64 during path pruning, also mark any insn defining a sub-register that is read in the pruned path as full-register. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 405b502283c5..704ed7971472 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -36,9 +36,11 @@ */ enum bpf_reg_liveness { REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ - REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ - REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */ - REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */ + REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ + REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ + REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, + REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ + REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; struct bpf_reg_state { @@ -131,6 +133,11 @@ struct bpf_reg_state { * pointing to bpf_func_state. */ u32 frameno; + /* Tracks subreg definition. The stored value is the insn_idx of the + * writing insn. This is safe because subreg_def is used before any insn + * patching which only happens after main verification finished. + */ + s32 subreg_def; enum bpf_reg_liveness live; }; @@ -233,6 +240,7 @@ struct bpf_insn_aux_data { int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ + bool zext_dst; /* this insn zero extends dst reg */ u8 alu_state; /* used in combination with alu_limit */ bool prune_point; unsigned int orig_idx; /* original instruction index */ -- cgit v1.2.3 From 7d134041a89610ae552501fc88652805addcdee4 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:14 +0100 Subject: bpf: introduce new mov32 variant for doing explicit zero extension The encoding for this new variant is based on BPF_X format. "imm" field was 0 only, now it could be 1 which means doing zero extension unconditionally .code = BPF_ALU | BPF_MOV | BPF_X .dst_reg = DST .src_reg = SRC .imm = 1 We use this new form for doing zero extension for which verifier will guarantee SRC == DST. Implications on JIT back-ends when doing code-gen for BPF_ALU | BPF_MOV | BPF_X: 1. No change if hardware already does zero extension unconditionally for sub-register write. 2. Otherwise, when seeing imm == 1, just generate insns to clear high 32-bit. No need to generate insns for the move because when imm == 1, dst_reg is the same as src_reg at the moment. Interpreter doesn't need change as well. It is doing unconditionally zero extension for mov32 already. One helper macro BPF_ZEXT_REG is added to help creating zero extension insn using this new mov32 variant. One helper function insn_is_zext is added for checking one insn is an zero extension on dst. This will be widely used by a few JIT back-ends in later patches in this set. Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7148bab96943..bb10ffb88452 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -160,6 +160,20 @@ struct ctl_table_header; .off = 0, \ .imm = IMM }) +/* Special form of mov32, used for doing explicit zero extension on dst. */ +#define BPF_ZEXT_REG(DST) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = DST, \ + .off = 0, \ + .imm = 1 }) + +static inline bool insn_is_zext(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1; +} + /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ #define BPF_LD_IMM64(DST, IMM) \ BPF_LD_IMM64_RAW(DST, 0, IMM) -- cgit v1.2.3 From a4b1d3c1ddf6cb441187b6c130a473c16a05a356 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:15 +0100 Subject: bpf: verifier: insert zero extension according to analysis result After previous patches, verifier will mark a insn if it really needs zero extension on dst_reg. It is then for back-ends to decide how to use such information to eliminate unnecessary zero extension code-gen during JIT compilation. One approach is verifier insert explicit zero extension for those insns that need zero extension in a generic way, JIT back-ends then do not generate zero extension for sub-register write at default. However, only those back-ends which do not have hardware zero extension want this optimization. Back-ends like x86_64 and AArch64 have hardware zero extension support that the insertion should be disabled. This patch introduces new target hook "bpf_jit_needs_zext" which returns false at default, meaning verifier zero extension insertion is disabled at default. A back-end could override this hook to return true if it doesn't have hardware support and want verifier insert zero extension explicitly. Offload targets do not use this native target hook, instead, they could get the optimization results using bpf_prog_offload_ops.finalize. NOTE: arches could have diversified features, it is possible for one arch to have hardware zero extension support for some sub-register write insns but not for all. For example, PowerPC, SPARC have zero extended loads, but not for alu32. So when verifier zero extension insertion enabled, these JIT back-ends need to peephole insns to remove those zero extension inserted for insn that actually has hardware zero extension support. The peephole could be as simple as looking the next insn, if it is a special zero extension insn then it is safe to eliminate it if the current insn has hardware zero extension support. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/filter.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fb3aa2dc975..d98141edb74b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -370,6 +370,7 @@ struct bpf_prog_aux { u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ + bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ diff --git a/include/linux/filter.h b/include/linux/filter.h index bb10ffb88452..ba8b65270e0d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -825,6 +825,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); +bool bpf_jit_needs_zext(void); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(void) -- cgit v1.2.3 From 4bfc0bb2c60e2f4cc8eb60f03cf8dfa72336272a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 25 May 2019 09:37:39 -0700 Subject: bpf: decouple the lifetime of cgroup_bpf from cgroup itself Currently the lifetime of bpf programs attached to a cgroup is bound to the lifetime of the cgroup itself. It means that if a user forgets (or intentionally avoids) to detach a bpf program before removing the cgroup, it will stay attached up to the release of the cgroup. Since the cgroup can stay in the dying state (the state between being rmdir()'ed and being released) for a very long time, it leads to a waste of memory. Also, it blocks a possibility to implement the memcg-based memory accounting for bpf objects, because a circular reference dependency will occur. Charged memory pages are pinning the corresponding memory cgroup, and if the memory cgroup is pinning the attached bpf program, nothing will be ever released. A dying cgroup can not contain any processes, so the only chance for an attached bpf program to be executed is a live socket associated with the cgroup. So in order to release all bpf data early, let's count associated sockets using a new percpu refcounter. On cgroup removal the counter is transitioned to the atomic mode, and as soon as it reaches 0, all bpf programs are detached. Because cgroup_bpf_release() can block, it can't be called from the percpu ref counter callback directly, so instead an asynchronous work is scheduled. The reference counter is not socket specific, and can be used for any other types of programs, which can be executed from a cgroup-bpf hook outside of the process context, had such a need arise in the future. Signed-off-by: Roman Gushchin Cc: jolsa@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 11 +++++++++-- include/linux/cgroup.h | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index cb3c6b3b89c8..9f100fc422c3 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -72,10 +73,16 @@ struct cgroup_bpf { /* temp storage for effective prog array used by prog_attach/detach */ struct bpf_prog_array __rcu *inactive; + + /* reference counter used to detach bpf programs after cgroup removal */ + struct percpu_ref refcnt; + + /* cgroup_bpf is released using a work queue */ + struct work_struct release_work; }; -void cgroup_bpf_put(struct cgroup *cgrp); int cgroup_bpf_inherit(struct cgroup *cgrp); +void cgroup_bpf_offline(struct cgroup *cgrp); int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); @@ -283,8 +290,8 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, struct bpf_prog; struct cgroup_bpf {}; -static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c0077adeea83..49e8facf7c4a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -924,4 +924,22 @@ static inline bool cgroup_task_frozen(struct task_struct *task) #endif /* !CONFIG_CGROUPS */ +#ifdef CONFIG_CGROUP_BPF +static inline void cgroup_bpf_get(struct cgroup *cgrp) +{ + percpu_ref_get(&cgrp->bpf.refcnt); +} + +static inline void cgroup_bpf_put(struct cgroup *cgrp) +{ + percpu_ref_put(&cgrp->bpf.refcnt); +} + +#else /* CONFIG_CGROUP_BPF */ + +static inline void cgroup_bpf_get(struct cgroup *cgrp) {} +static inline void cgroup_bpf_put(struct cgroup *cgrp) {} + +#endif /* CONFIG_CGROUP_BPF */ + #endif /* _LINUX_CGROUP_H */ -- cgit v1.2.3 From 54e9c9d4b506b611228890752d1cfa960e0965e1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:41 -0700 Subject: bpf: remove __rcu annotations from bpf_prog_array Drop __rcu annotations and rcu read sections from bpf_prog_array helper functions. They are not needed since all existing callers call those helpers from the rcu update side while holding a mutex. This guarantees that use-after-free could not happen. In the next patches I'll fix the callers with missing rcu_dereference_protected to make sparse/lockdep happy, the proper way to use these helpers is: struct bpf_prog_array __rcu *progs = ...; struct bpf_prog_array *p; mutex_lock(&mtx); p = rcu_dereference_protected(progs, lockdep_is_held(&mtx)); bpf_prog_array_length(p); bpf_prog_array_copy_to_user(p, ...); bpf_prog_array_delete_safe(p, ...); bpf_prog_array_copy_info(p, ...); bpf_prog_array_copy(p, ...); bpf_prog_array_free(p); mutex_unlock(&mtx); No functional changes! rcu_dereference_protected with lockdep_is_held should catch any cases where we update prog array without a mutex (I've looked at existing call sites and I think we hold a mutex everywhere). Motivation is to fix sparse warnings: kernel/bpf/core.c:1803:9: warning: incorrect type in argument 1 (different address spaces) kernel/bpf/core.c:1803:9: expected struct callback_head *head kernel/bpf/core.c:1803:9: got struct callback_head [noderef] * kernel/bpf/core.c:1877:44: warning: incorrect type in initializer (different address spaces) kernel/bpf/core.c:1877:44: expected struct bpf_prog_array_item *item kernel/bpf/core.c:1877:44: got struct bpf_prog_array_item [noderef] * kernel/bpf/core.c:1901:26: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1901:26: expected struct bpf_prog_array_item *existing kernel/bpf/core.c:1901:26: got struct bpf_prog_array_item [noderef] * kernel/bpf/core.c:1935:26: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1935:26: expected struct bpf_prog_array_item *[assigned] existing kernel/bpf/core.c:1935:26: got struct bpf_prog_array_item [noderef] * v2: * remove comment about potential race; that can't happen because all callers are in rcu-update section Cc: Roman Gushchin Acked-by: Roman Gushchin Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d98141edb74b..ff3e00ff84d2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -514,17 +514,17 @@ struct bpf_prog_array { }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); -void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); -int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, +void bpf_prog_array_free(struct bpf_prog_array *progs); +int bpf_prog_array_length(struct bpf_prog_array *progs); +int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, +void bpf_prog_array_delete_safe(struct bpf_prog_array *progs, struct bpf_prog *old_prog); -int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt); -int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, +int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); -- cgit v1.2.3 From dbcc1ba26e43bd32cb308e50ac4cb4a29d2f5967 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:43 -0700 Subject: bpf: cgroup: properly use bpf_prog_array api Now that we don't have __rcu markers on the bpf_prog_array helpers, let's use proper rcu_dereference_protected to obtain array pointer under mutex. We also don't need __rcu annotations on cgroup_bpf.inactive since it's not read/updated concurrently. v4: * drop cgroup_rcu_xyz wrappers and use rcu APIs directly; presumably should be more clear to understand which mutex/refcount protects each particular place v3: * amend cgroup_rcu_dereference to include percpu_ref_is_dying; cgroup_bpf is now reference counted and we don't hold cgroup_mutex anymore in cgroup_bpf_release v2: * replace xchg with rcu_swap_protected Cc: Roman Gushchin Signed-off-by: Stanislav Fomichev Acked-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 9f100fc422c3..b631ee75762d 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -72,7 +72,7 @@ struct cgroup_bpf { u32 flags[MAX_BPF_ATTACH_TYPE]; /* temp storage for effective prog array used by prog_attach/detach */ - struct bpf_prog_array __rcu *inactive; + struct bpf_prog_array *inactive; /* reference counter used to detach bpf programs after cgroup removal */ struct percpu_ref refcnt; -- cgit v1.2.3 From 1f52f6c0b0e846908e9c1082dab1b3f7088b82ac Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:35 -0700 Subject: bpf: Create BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY Create new macro BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() to be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs so BPF programs can request cwr for TCP packets. Current cgroup skb programs can only return 0 or 1 (0 to drop the packet. This macro changes the behavior so the low order bit indicates whether the packet should be dropped (0) or not (1) and the next bit is used for congestion notification (cn). Hence, new allowed return values of CGROUP EGRESS BPF programs are: 0: drop packet 1: keep packet 2: drop packet and call cwr 3: keep packet and call cwr This macro then converts it to one of NET_XMIT values or -EPERM that has the effect of dropping the packet with no cn. 0: NET_XMIT_SUCCESS skb should be transmitted (no cn) 1: NET_XMIT_DROP skb should be dropped and cwr called 2: NET_XMIT_CN skb should be transmitted and cwr called 3: -EPERM skb should be dropped (no cn) Note that when more than one BPF program is called, the packet is dropped if at least one of programs requests it be dropped, and there is cn if at least one program returns cn. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ff3e00ff84d2..2cc58fc0f413 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -552,6 +552,56 @@ _out: \ _ret; \ }) +/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs + * so BPF programs can request cwr for TCP packets. + * + * Current cgroup skb programs can only return 0 or 1 (0 to drop the + * packet. This macro changes the behavior so the low order bit + * indicates whether the packet should be dropped (0) or not (1) + * and the next bit is a congestion notification bit. This could be + * used by TCP to call tcp_enter_cwr() + * + * Hence, new allowed return values of CGROUP EGRESS BPF programs are: + * 0: drop packet + * 1: keep packet + * 2: drop packet and cn + * 3: keep packet and cn + * + * This macro then converts it to one of the NET_XMIT or an error + * code that is then interpreted as drop packet (and no cn): + * 0: NET_XMIT_SUCCESS skb should be transmitted + * 1: NET_XMIT_DROP skb should be dropped and cn + * 2: NET_XMIT_CN skb should be transmitted and cn + * 3: -EPERM skb should be dropped + */ +#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ + ({ \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ + struct bpf_prog_array *_array; \ + u32 ret; \ + u32 _ret = 1; \ + u32 _cn = 0; \ + preempt_disable(); \ + rcu_read_lock(); \ + _array = rcu_dereference(array); \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + ret = func(_prog, ctx); \ + _ret &= (ret & 1); \ + _cn |= (ret & 2); \ + _item++; \ + } \ + rcu_read_unlock(); \ + preempt_enable(); \ + if (_ret) \ + _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ + else \ + _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ + _ret; \ + }) + #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ __BPF_PROG_RUN_ARRAY(array, ctx, func, false) -- cgit v1.2.3 From 5cf1e91456301f8c4f6bbc63ff76cff12f92f31b Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:36 -0700 Subject: bpf: cgroup inet skb programs can return 0 to 3 Allows cgroup inet skb programs to return values in the range [0, 3]. The second bit is used to deterine if congestion occurred and higher level protocol should decrease rate. E.g. TCP would call tcp_enter_cwr() The bpf_prog must set expected_attach_type to BPF_CGROUP_INET_EGRESS at load time if it uses the new return values (i.e. 2 or 3). The expected_attach_type is currently not enforced for BPF_PROG_TYPE_CGROUP_SKB. e.g Meaning the current bpf_prog with expected_attach_type setting to BPF_CGROUP_INET_EGRESS can attach to BPF_CGROUP_INET_INGRESS. Blindly enforcing expected_attach_type will break backward compatibility. This patch adds a enforce_expected_attach_type bit to only enforce the expected_attach_type when it uses the new return value. Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index ba8b65270e0d..43b45d6db36d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -526,7 +526,8 @@ struct bpf_prog { blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ - has_callchain_buf:1; /* callchain buffer allocated? */ + has_callchain_buf:1, /* callchain buffer allocated? */ + enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ -- cgit v1.2.3 From 3539b96e041c06e4317082816d90ec09160aeb11 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:57 -0700 Subject: bpf: group memory related fields in struct bpf_map_memory Group "user" and "pages" fields of bpf_map into the bpf_map_memory structure. Later it can be extended with "memcg" and other related information. The main reason for a such change (beside cosmetics) is to pass bpf_map_memory structure to charging functions before the actual allocation of bpf_map. Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2cc58fc0f413..2e7c1c40d949 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -66,6 +66,11 @@ struct bpf_map_ops { u64 imm, u32 *off); }; +struct bpf_map_memory { + u32 pages; + struct user_struct *user; +}; + struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -86,7 +91,7 @@ struct bpf_map { u32 btf_key_type_id; u32 btf_value_type_id; struct btf *btf; - u32 pages; + struct bpf_map_memory memory; bool unpriv_array; bool frozen; /* write-once */ /* 48 bytes hole */ @@ -94,8 +99,7 @@ struct bpf_map { /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ - struct user_struct *user ____cacheline_aligned; - atomic_t refcnt; + atomic_t refcnt ____cacheline_aligned; atomic_t usercnt; struct work_struct work; char name[BPF_OBJ_NAME_LEN]; -- cgit v1.2.3 From b936ca643ade11f265fa10e5fb71c20d9c5243f1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:58 -0700 Subject: bpf: rework memlock-based memory accounting for maps In order to unify the existing memlock charging code with the memcg-based memory accounting, which will be added later, let's rework the current scheme. Currently the following design is used: 1) .alloc() callback optionally checks if the allocation will likely succeed using bpf_map_precharge_memlock() 2) .alloc() performs actual allocations 3) .alloc() callback calculates map cost and sets map.memory.pages 4) map_create() calls bpf_map_init_memlock() which sets map.memory.user and performs actual charging; in case of failure the map is destroyed 1) bpf_map_free_deferred() calls bpf_map_release_memlock(), which performs uncharge and releases the user 2) .map_free() callback releases the memory The scheme can be simplified and made more robust: 1) .alloc() calculates map cost and calls bpf_map_charge_init() 2) bpf_map_charge_init() sets map.memory.user and performs actual charge 3) .alloc() performs actual allocations 1) .map_free() callback releases the memory 2) bpf_map_charge_finish() performs uncharge and releases the user The new scheme also allows to reuse bpf_map_charge_init()/finish() functions for memcg-based accounting. Because charges are performed before actual allocations and uncharges after freeing the memory, no bogus memory pressure can be created. In cases when the map structure is not available (e.g. it's not created yet, or is already destroyed), on-stack bpf_map_memory structure is used. The charge can be transferred with the bpf_map_charge_move() function. Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2e7c1c40d949..3c8f24f402bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -650,9 +650,12 @@ struct bpf_map *__bpf_map_get(struct fd f); struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); -int bpf_map_precharge_memlock(u32 pages); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); +int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages); +void bpf_map_charge_finish(struct bpf_map_memory *mem); +void bpf_map_charge_move(struct bpf_map_memory *dst, + struct bpf_map_memory *src); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); -- cgit v1.2.3 From c85d69135a9175c50a823d04d62d932312d037b3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:59 -0700 Subject: bpf: move memory size checks to bpf_map_charge_init() Most bpf map types doing similar checks and bytes to pages conversion during memory allocation and charging. Let's unify these checks by moving them into bpf_map_charge_init(). Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3c8f24f402bf..e5a309e6a400 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -652,7 +652,7 @@ void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); -int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages); +int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size); void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); -- cgit v1.2.3