From fe21cb91ae7bca1ae7805454be80b6d03bec85f7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 2 Jul 2021 16:48:21 +0530 Subject: net: core: Split out code to run generic XDP prog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper can later be utilized in code that runs cpumap and devmap programs in generic redirect mode and adjusts the skb based on changes made to the xdp_buff. When returning XDP_REDIRECT/XDP_TX, it invokes __skb_push, so whenever a generic redirect path invokes a devmap/cpumap prog (if set), it must call __skb_pull again, as we expect the mac header to be pulled. It also drops the skb_reset_mac_len call after do_xdp_generic, as the mac_header and network_header are advanced by the same offset, so the difference (mac_len) remains constant. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210702111825.491065-2-memxor@gmail.com --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eaf5bb008aa9..42f6f866d5f3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3984,6 +3984,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED); } +u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog); void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); -- cgit v1.2.3 From cb0f80039fb7ec9981a74d22019daaa85ff51a3d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 2 Jul 2021 16:48:22 +0530 Subject: bitops: Add non-atomic bitops for pointers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cpumap needs to set, clear, and test the lowest bit in the skb pointer in various places. To make these checks less noisy, add pointer-friendly bitop macros that also do some typechecking to sanitize the argument. These wrap the non-atomic bitops __set_bit, __clear_bit, and test_bit, but for pointer arguments. The pointer's address has to be passed in, and it is treated as an unsigned long *, since the width and representation of a pointer and an unsigned long match on the targets Linux supports. They are prefixed with a double underscore to indicate lack of atomicity.
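For illustration, here is a minimal usage sketch of the new macros; the wrapper function is hypothetical and not part of the patch:

	/* Tag an skb pointer in its lowest bit, as generic cpumap will do. */
	static void *tag_skb_ptr(struct sk_buff *skb)
	{
		void *ptr = skb;

		__ptr_set_bit(0, &ptr);		/* mark: this entry carries an skb */
		if (__ptr_test_bit(0, &ptr))	/* true: the tag is set */
			__ptr_clear_bit(0, &ptr);	/* recover the original pointer */
		return ptr;
	}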
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210702111825.491065-3-memxor@gmail.com --- include/linux/bitops.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/typecheck.h | 9 +++++++++ 2 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 26bf15e6cd35..5e62e2383b7f 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -4,6 +4,7 @@ #include #include +#include #include @@ -253,6 +254,55 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, __clear_bit(nr, addr); } +/** + * __ptr_set_bit - Set bit in a pointer's value + * @nr: the bit to set + * @addr: the address of the pointer variable + * + * Example: + * void *p = foo(); + * __ptr_set_bit(bit, &p); + */ +#define __ptr_set_bit(nr, addr) \ + ({ \ + typecheck_pointer(*(addr)); \ + __set_bit(nr, (unsigned long *)(addr)); \ + }) + +/** + * __ptr_clear_bit - Clear bit in a pointer's value + * @nr: the bit to clear + * @addr: the address of the pointer variable + * + * Example: + * void *p = foo(); + * __ptr_clear_bit(bit, &p); + */ +#define __ptr_clear_bit(nr, addr) \ + ({ \ + typecheck_pointer(*(addr)); \ + __clear_bit(nr, (unsigned long *)(addr)); \ + }) + +/** + * __ptr_test_bit - Test bit in a pointer's value + * @nr: the bit to test + * @addr: the address of the pointer variable + * + * Example: + * void *p = foo(); + * if (__ptr_test_bit(bit, &p)) { + * ... + * } else { + * ... + * } + */ +#define __ptr_test_bit(nr, addr) \ + ({ \ + typecheck_pointer(*(addr)); \ + test_bit(nr, (unsigned long *)(addr)); \ + }) + #ifdef __KERNEL__ #ifndef set_mask_bits diff --git a/include/linux/typecheck.h b/include/linux/typecheck.h index 20d310331eb5..46b15e2aaefb 100644 --- a/include/linux/typecheck.h +++ b/include/linux/typecheck.h @@ -22,4 +22,13 @@ (void)__tmp; \ }) +/* + * Check at compile time that something is a pointer type. + */ +#define typecheck_pointer(x) \ +({ typeof(x) __dummy; \ + (void)sizeof(*__dummy); \ + 1; \ +}) + #endif /* TYPECHECK_H_INCLUDED */ -- cgit v1.2.3 From 11941f8a85362f612df61f4aaab0e41b64d2111d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 2 Jul 2021 16:48:23 +0530 Subject: bpf: cpumap: Implement generic cpumap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change implements CPUMAP redirect support for generic XDP programs. The idea is to reuse the cpu map entry's queue that is used to push native xdp frames, for redirecting the skb to a different CPU. This matches native XDP behavior (in that RPS is invoked again for a packet reinjected into the networking stack). To be able to determine whether the incoming skb is from the driver or cpumap, we reuse the skb->redirected bit, which skips generic XDP processing when it is set. To always make use of this, the CONFIG_NET_REDIRECT guard on it has been lifted and it is always available. From the redirect side, we add the skb to the ptr_ring with its lowest bit set to 1. This should be safe, as an skb is never 1-byte aligned. This allows the kthread to discern between xdp_frames and sk_buffs. On consumption of the ptr_ring item, the lowest bit is unset. In the end, the skb is simply added to the list that the kthread is going to maintain anyway for xdp_frames converted to skbs, and then received again by using netif_receive_skb_list.
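A hedged sketch of that consumption side (names are illustrative; the real logic lives in the cpumap kthread in kernel/bpf/cpumap.c):

	void *ptr = frames[i];	/* one ptr_ring entry */

	if (__ptr_test_bit(0, &ptr)) {	/* tagged entry: it is an skb */
		struct sk_buff *skb = ptr;

		__ptr_clear_bit(0, &skb);	/* undo the tag set at enqueue */
		list_add_tail(&skb->list, &list);	/* netif_receive_skb_list() later */
	} else {
		/* untagged entry: a native xdp_frame, converted to an skb as before */
	}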
Bulking optimization for generic cpumap is left as an exercise for a future patch for now. Since cpumap entry progs are now supported, also remove check in generic_xdp_install for the cpumap. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Reviewed-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20210702111825.491065-4-memxor@gmail.com --- include/linux/bpf.h | 9 ++++++++- include/linux/skbuff.h | 10 ++-------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f309fc1509f2..095aaa104c56 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1513,7 +1513,8 @@ bool dev_map_can_have_prog(struct bpf_map *map); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); -bool cpu_map_prog_allowed(struct bpf_map *map); +int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, + struct sk_buff *skb); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) @@ -1710,6 +1711,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, return 0; } +static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, + struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + static inline bool cpu_map_prog_allowed(struct bpf_map *map) { return false; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b2db9cd9a73f..f19190820e63 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -863,8 +863,8 @@ struct sk_buff { __u8 tc_skip_classify:1; __u8 tc_at_ingress:1; #endif -#ifdef CONFIG_NET_REDIRECT __u8 redirected:1; +#ifdef CONFIG_NET_REDIRECT __u8 from_ingress:1; #endif #ifdef CONFIG_TLS_DEVICE @@ -4664,17 +4664,13 @@ static inline __wsum lco_csum(struct sk_buff *skb) static inline bool skb_is_redirected(const struct sk_buff *skb) { -#ifdef CONFIG_NET_REDIRECT return skb->redirected; -#else - return false; -#endif } static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) { -#ifdef CONFIG_NET_REDIRECT skb->redirected = 1; +#ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; if (skb->from_ingress) skb->tstamp = 0; @@ -4683,9 +4679,7 @@ static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) static inline void skb_reset_redirect(struct sk_buff *skb) { -#ifdef CONFIG_NET_REDIRECT skb->redirected = 0; -#endif } static inline bool skb_csum_is_sctp(struct sk_buff *skb) -- cgit v1.2.3 From 2ea5eabaf04a1829383aefe98ac38a2e5ae2d698 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 2 Jul 2021 16:48:24 +0530 Subject: bpf: devmap: Implement devmap prog execution for generic XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This lifts the restriction on running devmap BPF progs in generic redirect mode. To match native XDP behavior, it is invoked right before generic_xdp_tx is called, and only supports XDP_PASS/XDP_ABORTED/ XDP_DROP actions. We also return 0 even if devmap program drops the packet, as semantically redirect has already succeeded and the devmap prog is the last point before TX of the packet to device where it can deliver a verdict on the packet. This also means it must take care of freeing the skb, as xdp_do_generic_redirect callers only do that in case an error is returned. 
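A hedged sketch of the control flow this describes, built from the helpers named above (illustrative only; the actual code lives in the devmap/generic-XDP paths):

	act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
	switch (act) {
	case XDP_PASS:
		generic_xdp_tx(skb, xdp_prog);	/* last stop before the device */
		break;
	default:	/* XDP_ABORTED or XDP_DROP */
		kfree_skb(skb);	/* we must free: 0 is returned either way */
		break;
	}
	return 0;	/* the redirect itself already succeeded */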
Since devmap entry prog is supported, remove the check in generic_xdp_install entirely. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210702111825.491065-5-memxor@gmail.com --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 095aaa104c56..4afbff308ca3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1508,7 +1508,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress); -bool dev_map_can_have_prog(struct bpf_map *map); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, -- cgit v1.2.3 From 5c2c85315948c42c6c0258cf9bad596acaa79043 Mon Sep 17 00:00:00 2001 From: Richard Laing Date: Thu, 15 Jul 2021 09:18:05 +1200 Subject: bus: mhi: pci-generic: configurable network interface MRU The MRU value used by the MHI MBIM network interface affects the throughput performance of the interface. Different modem models use different default MRU sizes based on their bandwidth capabilities. Large values generally result in higher throughput for larger packet sizes. In addition if the MRU used by the MHI device is larger than that specified in the MHI net device the data is fragmented and needs to be re-assembled which generates a (single) warning message about the fragmented packets. Setting the MRU on both ends avoids the extra processing to re-assemble the packets. This patch allows the documented MRU for a modem to be automatically set as the MHI net device MRU avoiding fragmentation and improving throughput performance. Signed-off-by: Richard Laing Signed-off-by: David S. Miller --- include/linux/mhi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 944aa3aa3035..beb918328eef 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -356,6 +356,7 @@ struct mhi_controller_config { * @fbc_download: MHI host needs to do complete image transfer (optional) * @wake_set: Device wakeup set flag * @irq_flags: irq flags passed to request_irq (optional) + * @mru: the default MRU for the MHI device * * Fields marked as (required) need to be populated by the controller driver * before calling mhi_register_controller(). For the fields marked as (optional) @@ -448,6 +449,7 @@ struct mhi_controller { bool fbc_download; bool wake_set; unsigned long irq_flags; + u32 mru; }; /** -- cgit v1.2.3 From b00628b1c7d595ae5b544e059c27b1f5828314b4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:09 -0700 Subject: bpf: Introduce bpf timers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded in hash/array/lru maps as a regular field and helpers to operate on it: // Initialize the timer. // First 4 bits of 'flags' specify clockid. // Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, int flags); // Configure the timer to call 'callback_fn' static function. long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn); // Arm the timer to expire 'nsec' nanoseconds from the current time. 
long bpf_timer_start(struct bpf_timer *timer, u64 nsec, u64 flags); // Cancel the timer and wait for callback_fn to finish if it was running. long bpf_timer_cancel(struct bpf_timer *timer); Here is how a BPF program might look: struct map_elem { int counter; struct bpf_timer timer; }; struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 1000); __type(key, int); __type(value, struct map_elem); } hmap SEC(".maps"); static int timer_cb(void *map, int *key, struct map_elem *val); /* val points to particular map element that contains bpf_timer. */ SEC("fentry/bpf_fentry_test1") int BPF_PROG(test1, int a) { struct map_elem *val; int key = 0; val = bpf_map_lookup_elem(&hmap, &key); if (val) { bpf_timer_init(&val->timer, &hmap, CLOCK_REALTIME); bpf_timer_set_callback(&val->timer, timer_cb); bpf_timer_start(&val->timer, 1000 /* call timer_cb in 1 usec */, 0); } } This patch adds helper implementations that rely on hrtimers to call bpf functions as timers expire. The following patches add the necessary safety checks. Only programs with CAP_BPF are allowed to use bpf_timer. The number of timers used by the program is constrained by the memcg recorded at map creation time. The bpf_timer_init() helper needs an explicit 'map' argument because inner maps are dynamic and not known at load time, while bpf_timer_set_callback() receives a hidden 'aux->prog' argument supplied by the verifier. The prog pointer is needed to do refcounting of the bpf program, to make sure that the program doesn't get freed while the timer is armed. This approach relies on the "user refcnt" scheme used in prog_array that stores bpf programs for bpf_tail_call. The bpf_timer_set_callback() will increment the prog refcnt, which is paired with bpf_timer_cancel() that will drop the prog refcnt. The ops->map_release_uref is responsible for cancelling the timers and dropping the prog refcnt when the user space reference to a map reaches zero. This uref approach is done to make sure that Ctrl-C of a user space process will not leave timers running forever, unless the user space explicitly pinned a map that contained timers in bpffs. bpf_timer_init() and bpf_timer_set_callback() will return -EPERM if the map doesn't have user references (i.e. is not held by an open file descriptor from user space and not pinned in bpffs). The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel and free the timer if the given map element had one allocated. The "bpftool map update" command can be used to cancel timers. The 'struct bpf_timer' is explicitly __attribute__((aligned(8))) because '__u64 :64' is an anonymous bitfield: it contributes 8 bytes of padding but only 1-byte alignment.
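Conceptually, the hrtimer glue looks roughly like the sketch below. The struct bpf_hrtimer shape and field names are assumptions for illustration; the real bpf_timer_cb() also handles locking, refcounting and callback teardown.

	static enum hrtimer_restart bpf_timer_cb_sketch(struct hrtimer *hrtimer)
	{
		/* assumed layout: embedded hrtimer plus map/key/value context */
		struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
		u64 (*cb)(u64, u64, u64, u64, u64) = READ_ONCE(t->callback_fn);

		if (cb)	/* invoked as callback_fn(map, key, value); must return 0 */
			cb((u64)(long)t->map, (u64)(long)t->key,
			   (u64)(long)t->value, 0, 0);
		return HRTIMER_NORESTART;	/* re-arming happens only via bpf_timer_start() */
	}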
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4afbff308ca3..125240b7cefb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -168,6 +168,7 @@ struct bpf_map { u32 max_entries; u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ + int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; u32 btf_key_type_id; @@ -221,6 +222,7 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); +void bpf_timer_cancel_and_free(void *timer); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; @@ -314,6 +316,7 @@ enum bpf_arg_type { ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ + ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ __BPF_ARG_TYPE_MAX, }; -- cgit v1.2.3 From 68134668c17f31f51930478f75495b552a411550 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:10 -0700 Subject: bpf: Add map side support for bpf timers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restrict bpf timers to array, hash (both preallocated and kmalloced), and lru map types. Per-cpu maps with timers don't make sense, since 'struct bpf_timer' is a part of the map value. bpf timers in per-cpu maps would mean that the number of timers depends on the number of possible cpus, and the timers would not be accessible from all cpus. lpm map support can be added in the future. Timers in inner maps are supported. The bpf_map_update/delete_elem() helpers and sys_bpf commands cancel and free the bpf_timer in a given map element. Similar to 'struct bpf_spin_lock', BTF is required and it is used to validate that the map element indeed contains a 'struct bpf_timer'. Make check_and_init_map_value() init both bpf_spin_lock and bpf_timer when map element data is reused in preallocated htab and lru maps. Teach copy_map_value() to support both bpf_spin_lock and bpf_timer in a single map element. There could be one of each, but not more than one. Due to the 'one bpf_timer in one element' restriction, do not support timers in global data, since global data is a map of a single element, but from the bpf program side it's seen as many global variables, and a restriction to a single global timer would be odd. The sys_bpf map_freeze and sys_mmap syscalls are not allowed on maps with timers, since user space could corrupt an mmap'ed element and crash the kernel. Maps with timers cannot be read-only. Due to these restrictions, search for bpf_timer in the datasec BTF in case it was placed in global data, to report a clear error. The previous patch allowed 'struct bpf_timer' as a first field in a map element only. Relax this restriction. Refactor the lru map to s/bpf_lru_push_free/htab_lru_push_free/ to cancel and free the timer when the lru map deletes an element as a part of its eviction algorithm. Make sure that a bpf program cannot access 'struct bpf_timer' via direct load/store. The timer operations are done through helpers only.
This is similar to 'struct bpf_spin_lock'. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-5-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 44 +++++++++++++++++++++++++++++++++----------- include/linux/btf.h | 1 + 2 files changed, 34 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 125240b7cefb..a9a4a480a6d0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -198,24 +198,46 @@ static inline bool map_value_has_spin_lock(const struct bpf_map *map) return map->spin_lock_off >= 0; } -static inline void check_and_init_map_lock(struct bpf_map *map, void *dst) +static inline bool map_value_has_timer(const struct bpf_map *map) { - if (likely(!map_value_has_spin_lock(map))) - return; - *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = - (struct bpf_spin_lock){}; + return map->timer_off >= 0; } -/* copy everything but bpf_spin_lock */ +static inline void check_and_init_map_value(struct bpf_map *map, void *dst) +{ + if (unlikely(map_value_has_spin_lock(map))) + *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = + (struct bpf_spin_lock){}; + if (unlikely(map_value_has_timer(map))) + *(struct bpf_timer *)(dst + map->timer_off) = + (struct bpf_timer){}; +} + +/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { + u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0; + if (unlikely(map_value_has_spin_lock(map))) { - u32 off = map->spin_lock_off; + s_off = map->spin_lock_off; + s_sz = sizeof(struct bpf_spin_lock); + } else if (unlikely(map_value_has_timer(map))) { + t_off = map->timer_off; + t_sz = sizeof(struct bpf_timer); + } - memcpy(dst, src, off); - memcpy(dst + off + sizeof(struct bpf_spin_lock), - src + off + sizeof(struct bpf_spin_lock), - map->value_size - off - sizeof(struct bpf_spin_lock)); + if (unlikely(s_sz || t_sz)) { + if (s_off < t_off || !s_sz) { + swap(s_off, t_off); + swap(s_sz, t_sz); + } + memcpy(dst, src, t_off); + memcpy(dst + t_off + t_sz, + src + t_off + t_sz, + s_off - t_off - t_sz); + memcpy(dst + s_off + s_sz, + src + s_off + s_sz, + map->value_size - s_off - s_sz); } else { memcpy(dst, src, map->value_size); } diff --git a/include/linux/btf.h b/include/linux/btf.h index 94a0c976c90f..214fde93214b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -99,6 +99,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); +int btf_find_timer(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit v1.2.3 From 3e8ce29850f1839d0603f925b30be9d8a4329917 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:11 -0700 Subject: bpf: Prevent pointer mismatch in bpf_timer_init. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpf_timer_init() arguments are: 1. pointer to a timer (which is embedded in map element). 2. pointer to a map. 
Make sure that pointer to a timer actually belongs to that map. Use map_uid (which is unique id of inner map) to reject: inner_map1 = bpf_map_lookup_elem(outer_map, key1) inner_map2 = bpf_map_lookup_elem(outer_map, key2) if (inner_map1 && inner_map2) { timer = bpf_map_lookup_elem(inner_map1); if (timer) // mismatch would have been allowed bpf_timer_init(timer, inner_map2); } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-6-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e774ecc1cd1f..5d3169b57e6e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -53,7 +53,14 @@ struct bpf_reg_state { /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL */ - struct bpf_map *map_ptr; + struct { + struct bpf_map *map_ptr; + /* To distinguish map lookups from outer map + * the map_uid is non-zero for registers + * pointing to inner maps. + */ + u32 map_uid; + }; /* for PTR_TO_BTF_ID */ struct { -- cgit v1.2.3 From bfc6bb74e4f16ab264fa73398a7a79d7d2afac2e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:14 -0700 Subject: bpf: Implement verifier support for validation of async callbacks. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpf_for_each_map_elem() and bpf_timer_set_callback() helpers are relying on PTR_TO_FUNC infra in the verifier to validate addresses to subprograms and pass them into the helpers as function callbacks. In case of bpf_for_each_map_elem() the callback is invoked synchronously and the verifier treats it as a normal subprogram call by adding another bpf_func_state and new frame in __check_func_call(). bpf_timer_set_callback() doesn't invoke the callback directly. The subprogram will be called asynchronously from bpf_timer_cb(). Teach the verifier to validate such async callbacks as special kind of jump by pushing verifier state into stack and let pop_stack() process it. Special care needs to be taken during state pruning. The call insn doing bpf_timer_set_callback has to be a prune_point. Otherwise short timer callbacks might not have prune points in front of bpf_timer_set_callback() which means is_state_visited() will be called after this call insn is processed in __check_func_call(). Which means that another async_cb state will be pushed to be walked later and the verifier will eventually hit BPF_COMPLEXITY_LIMIT_JMP_SEQ limit. Since push_async_cb() looks like another push_stack() branch the infinite loop detection will trigger false positive. To recognize this case mark such states as in_async_callback_fn. To distinguish infinite loop in async callback vs the same callback called with different arguments for different map and timer add async_entry_cnt to bpf_func_state. Enforce return zero from async callbacks. 
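To make the last point concrete, here is a hedged example of a conforming async callback, reusing the map_elem type from the bpf_timer introduction above:

	static int timer_cb(void *map, int *key, struct map_elem *val)
	{
		val->counter++;	/* ordinary map value access is allowed */
		return 0;	/* async callbacks must return zero */
	}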
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-9-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5d3169b57e6e..242d0b1a0772 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -208,12 +208,19 @@ struct bpf_func_state { * zero == main subprog */ u32 subprogno; + /* Every bpf_timer_start will increment async_entry_cnt. + * It's used to distinguish: + * void foo(void) { for(;;); } + * void foo(void) { bpf_timer_set_callback(,foo); } + */ + u32 async_entry_cnt; + bool in_callback_fn; + bool in_async_callback_fn; /* The following fields should be last. See copy_func_state() */ int acquired_refs; struct bpf_reference_state *refs; int allocated_stack; - bool in_callback_fn; struct bpf_stack_state *stack; }; -- cgit v1.2.3 From 7ddc80a476c2d599246028af5808d15f9e24c109 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:15 -0700 Subject: bpf: Teach stack depth check about async callbacks. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Teach the max stack depth checking algorithm about async callbacks, which don't increase the bpf program stack size. Also add a sanity check that bpf_tail_call didn't sneak into an async cb. It's impossible, since PTR_TO_CTX is not available in an async cb, hence the program cannot contain bpf_tail_call(ctx,...); Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-10-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 242d0b1a0772..b847e1ccd10f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -406,6 +406,7 @@ struct bpf_subprog_info { bool has_tail_call; bool tail_call_reachable; bool has_ld_abs; + bool is_async_cb; }; /* single container for all structs -- cgit v1.2.3 From 7e6f3cd89f04a0a577002d5696288b482109d25c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:53 +0200 Subject: bpf, x86: Store caller's ip in trampoline stack Store the caller's ip on the trampoline's stack. Trampoline programs can reach the IP at the (ctx - 8) address, so there's no change in the program's arguments interface. The IP address is taken from [fp + 8], which is the return address from the initial 'call fentry' call to the trampoline. This IP address will be returned via the bpf_get_func_ip helper, which is added in following patches. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210714094400.396467-2-jolsa@kernel.org --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9a4a480a6d0..94d77dc7ce35 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -579,6 +579,11 @@ struct btf_func_model { */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) +/* Store IP address of the caller on the trampoline stack, + * so it's available for trampoline's programs.
+ */ +#define BPF_TRAMP_F_IP_ARG BIT(3) + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 */ -- cgit v1.2.3 From 1e37392cccdea94da635e3c6d16b21865806f619 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:54 +0200 Subject: bpf: Enable BPF_TRAMP_F_IP_ARG for trampolines with call_get_func_ip Enabling BPF_TRAMP_F_IP_ARG for trampolines that actually need it. The BPF_TRAMP_F_IP_ARG adds extra 3 instructions to trampoline code and is used only by programs with bpf_get_func_ip helper, which is added in following patch and sets call_get_func_ip bit. This patch ensures that BPF_TRAMP_F_IP_ARG flag is used only for trampolines that have programs with call_get_func_ip set. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210714094400.396467-3-jolsa@kernel.org --- include/linux/filter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 472f97074da0..ba36989f711a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -559,7 +559,8 @@ struct bpf_prog { kprobe_override:1, /* Do we override a kprobe? */ has_callchain_buf:1, /* callchain buffer allocated? */ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ - call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */ + call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ + call_get_func_ip:1; /* Do we call get_func_ip() */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ -- cgit v1.2.3 From 17edea21b38d047a10c189296c58aea9875d0d0a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 4 Jul 2021 12:02:42 -0700 Subject: sock_map: Relax config dependency to CONFIG_NET Currently sock_map still has Kconfig dependency on CONFIG_INET, but there is no actual functional dependency on it after we introduce ->psock_update_sk_prot(). We have to extend it to CONFIG_NET now as we are going to support AF_UNIX. 
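The resulting guard structure can be sketched as follows, compressed from the diff that follows (one prototype shown; the stub return value matches the hunk below):

	#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
	int sock_map_update_elem_sys(struct bpf_map *map, void *key,
				     void *value, u64 flags);
	#else
	static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key,
						   void *value, u64 flags)
	{
		return -EOPNOTSUPP;	/* sock_map is compiled out */
	}
	#endif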
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210704190252.11866-2-xiyou.wangcong@gmail.com --- include/linux/bpf.h | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 94d77dc7ce35..d25c16c365e5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1887,6 +1887,12 @@ void bpf_map_offload_map_free(struct bpf_map *map); int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); + +int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); +void sock_map_unhash(struct sock *sk); +void sock_map_close(struct sock *sk, long timeout); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -1919,24 +1925,6 @@ static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog, { return -ENOTSUPP; } -#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ - -#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); -int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); -int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); -void sock_map_unhash(struct sock *sk); -void sock_map_close(struct sock *sk, long timeout); - -void bpf_sk_reuseport_detach(struct sock *sk); -int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, - void *value); -int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags); -#else -static inline void bpf_sk_reuseport_detach(struct sock *sk) -{ -} #ifdef CONFIG_BPF_SYSCALL static inline int sock_map_get_from_fd(const union bpf_attr *attr, @@ -1956,7 +1944,21 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void { return -EOPNOTSUPP; } +#endif /* CONFIG_BPF_SYSCALL */ +#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); +#else +static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { -- cgit v1.2.3 From c7603cfa04e7c3a435b31d065f7cbdc829428f6e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Jul 2021 16:06:15 -0700 Subject: bpf: Add ambient BPF runtime context stored in current b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed the problem with cgroup-local storage use in BPF by pre-allocating per-CPU array of 8 cgroup storage pointers to accommodate possible BPF program preemptions and nested executions. While this seems to work good in practice, it introduces new and unnecessary failure mode in which not all BPF programs might be executed if we fail to find an unused slot for cgroup storage, however unlikely it is. It might also not be so unlikely when/if we allow sleepable cgroup BPF programs in the future. 
Further, the way that cgroup storage is implemented as ambiently-available property during entire BPF program execution is a convenient way to pass extra information to BPF program and helpers without requiring user code to pass around extra arguments explicitly. So it would be good to have a generic solution that can allow implementing this without arbitrary restrictions. Ideally, such solution would work for both preemptable and sleepable BPF programs in exactly the same way. This patch introduces such solution, bpf_run_ctx. It adds one pointer field (bpf_ctx) to task_struct. This field is maintained by BPF_PROG_RUN family of macros in such a way that it always stays valid throughout BPF program execution. BPF program preemption is handled by remembering previous current->bpf_ctx value locally while executing nested BPF program and restoring old value after nested BPF program finishes. This is handled by two helper functions, bpf_set_run_ctx() and bpf_reset_run_ctx(), which are supposed to be used before and after BPF program runs, respectively. Restoring old value of the pointer handles preemption, while bpf_run_ctx pointer being a property of current task_struct naturally solves this problem for sleepable BPF programs by "following" BPF program execution as it is scheduled in and out of CPU. It would even allow CPU migration of BPF programs, even though it's not currently allowed by BPF infra. This patch cleans up cgroup local storage handling as a first application. The design itself is generic, though, with bpf_run_ctx being an empty struct that is supposed to be embedded into a specific struct for a given BPF program type (bpf_cg_run_ctx in this case). Follow up patches are planned that will expand this mechanism for other uses within tracing BPF programs. To verify that this change doesn't revert the fix to the original cgroup storage issue, I ran the same repro as in the original report ([0]) and didn't get any problems. Replacing bpf_reset_run_ctx(old_run_ctx) with bpf_reset_run_ctx(NULL) triggers the issue pretty quickly (so repro does work). [0] https://lore.kernel.org/bpf/YEEvBUiJl2pJkxTd@krava/ Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210712230615.3525979-1-andrii@kernel.org --- include/linux/bpf-cgroup.h | 54 ---------------------------------------------- include/linux/bpf.h | 54 +++++++++++++++++++++++++++++----------------- include/linux/sched.h | 3 +++ 3 files changed, 37 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8b77d08d4b47..a74cd1c3bd87 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -27,19 +27,6 @@ struct task_struct; extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; #define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) -#define BPF_CGROUP_STORAGE_NEST_MAX 8 - -struct bpf_cgroup_storage_info { - struct task_struct *task; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; -}; - -/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks - * to use bpf cgroup storage simultaneously. 
- */ -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -172,44 +159,6 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type( return BPF_CGROUP_STORAGE_SHARED; } -static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage - *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) -{ - enum bpf_cgroup_storage_type stype; - int i, err = 0; - - preempt_disable(); - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, current); - for_each_cgroup_storage_type(stype) - this_cpu_write(bpf_cgroup_storage_info[i].storage[stype], - storage[stype]); - goto out; - } - err = -EBUSY; - WARN_ON_ONCE(1); - -out: - preempt_enable(); - return err; -} - -static inline void bpf_cgroup_storage_unset(void) -{ - int i; - - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); - return; - } -} - struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked); @@ -487,9 +436,6 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } -static inline int bpf_cgroup_storage_set( - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; } -static inline void bpf_cgroup_storage_unset(void) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0edff8f5177e..978ebd16ae60 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1142,38 +1142,40 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); +struct bpf_run_ctx {}; + +struct bpf_cg_run_ctx { + struct bpf_run_ctx run_ctx; + struct bpf_prog_array_item *prog_item; +}; + /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) /* BPF program asks to set CN on the packet. */ #define BPF_RET_SET_CN (1 << 0) -/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY, - * if bpf_cgroup_storage_set() failed, the rest of programs - * will not execute. This should be a really rare scenario - * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of - * preemptions all between bpf_cgroup_storage_set() and - * bpf_cgroup_storage_unset() on the same cpu. 
- */ #define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ ({ \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 _ret = 1; \ u32 func_ret; \ migrate_disable(); \ rcu_read_lock(); \ _array = rcu_dereference(array); \ _item = &_array->items[0]; \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); \ while ((_prog = READ_ONCE(_item->prog))) { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ + run_ctx.prog_item = _item; \ func_ret = func(_prog, ctx); \ _ret &= (func_ret & 1); \ - *(ret_flags) |= (func_ret >> 1); \ - bpf_cgroup_storage_unset(); \ + *(ret_flags) |= (func_ret >> 1); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ rcu_read_unlock(); \ migrate_enable(); \ _ret; \ @@ -1184,6 +1186,8 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 _ret = 1; \ migrate_disable(); \ rcu_read_lock(); \ @@ -1191,17 +1195,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, if (unlikely(check_non_null && !_array))\ goto _out; \ _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - if (!set_cg_storage) { \ - _ret &= func(_prog, ctx); \ - } else { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ - _ret &= func(_prog, ctx); \ - bpf_cgroup_storage_unset(); \ - } \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\ + while ((_prog = READ_ONCE(_item->prog))) { \ + run_ctx.prog_item = _item; \ + _ret &= func(_prog, ctx); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ _out: \ rcu_read_unlock(); \ migrate_enable(); \ @@ -1284,6 +1284,20 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } +static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) +{ + struct bpf_run_ctx *old_ctx; + + old_ctx = current->bpf_ctx; + current->bpf_ctx = new_ctx; + return old_ctx; +} + +static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) +{ + current->bpf_ctx = old_ctx; +} + extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; diff --git a/include/linux/sched.h b/include/linux/sched.h index ec8d07d88641..c64119aa2e60 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -42,6 +42,7 @@ struct backing_dev_info; struct bio_list; struct blk_plug; struct bpf_local_storage; +struct bpf_run_ctx; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1379,6 +1380,8 @@ struct task_struct { #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; + /* Used for BPF run context */ + struct bpf_run_ctx *bpf_ctx; #endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK -- cgit v1.2.3 From 96cd2dd65bb0b94c908f2df32bba7350fc1b954e Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 28 Dec 2020 10:38:12 +0200 Subject: net/mlx5: Add DCS caps & fields support This fields will be needed when adding a support for DCS offload max_dci_stream_channels - maximum DCI stream channels supported per DCI. max_dci_errored_streams - maximum DCI error stream channels supported per DCI before a DCI move to error state. 
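A hedged sketch of how a consumer might read and apply these caps, assuming the standard MLX5_CAP_GEN()/MLX5_SET() accessors (the DCS code that actually uses them is not part of this patch):

	u8 max_log = MLX5_CAP_GEN(mdev, log_max_dci_stream_channels);

	if (max_log)	/* clamp the requested value to the reported cap */
		MLX5_SET(qpc, qpc, log_num_dci_stream_channels,
			 min_t(u8, log_num_requested, max_log));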
Signed-off-by: Lior Nahmanson Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index b0009aa3647f..3dd6641e942c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1651,7 +1651,13 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 max_geneve_tlv_option_data_len[0x5]; u8 reserved_at_570[0x10]; - u8 reserved_at_580[0x33]; + u8 reserved_at_580[0xb]; + u8 log_max_dci_stream_channels[0x5]; + u8 reserved_at_590[0x3]; + u8 log_max_dci_errored_streams[0x5]; + u8 reserved_at_598[0x8]; + + u8 reserved_at_5a0[0x13]; u8 log_max_dek[0x5]; u8 reserved_at_5b8[0x4]; u8 mini_cqe_resp_stride_index[0x1]; @@ -3020,10 +3026,12 @@ struct mlx5_ifc_qpc_bits { u8 reserved_at_3c0[0x8]; u8 next_send_psn[0x18]; - u8 reserved_at_3e0[0x8]; + u8 reserved_at_3e0[0x3]; + u8 log_num_dci_stream_channels[0x5]; u8 cqn_snd[0x18]; - u8 reserved_at_400[0x8]; + u8 reserved_at_400[0x3]; + u8 log_num_dci_errored_streams[0x5]; u8 deth_sqpn[0x18]; u8 reserved_at_420[0x20]; -- cgit v1.2.3 From 0fac6aa098edf91ba65370da03811d9aba5715a9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 19 Jul 2021 20:14:42 +0300 Subject: net: dsa: sja1105: delete the best_effort_vlan_filtering mode Simply put, the best-effort VLAN filtering mode relied on VLAN retagging from a bridge VLAN towards a tag_8021q sub-VLAN in order to be able to decode the source port in the tagger, but the VLAN retagging implementation inside the sja1105 chips is not the best and we were relying on marginal operating conditions. The most notable limitation of the best-effort VLAN filtering mode is its incapacity to treat this case properly: ip link add br0 type bridge vlan_filtering 1 ip link set swp2 master br0 ip link set swp4 master br0 bridge vlan del dev swp4 vid 1 bridge vlan add dev swp4 vid 1 pvid When sending an untagged packet through swp2, the expectation is for it to be forwarded to swp4 as egress-tagged (so it will contain VLAN ID 1 on egress). But the switch will send it as egress-untagged. There was an attempt to fix this here: https://patchwork.kernel.org/project/netdevbpf/patch/20210407201452.1703261-2-olteanv@gmail.com/ but it failed miserably because it broke PTP RX timestamping, in a way that cannot be corrected due to hardware issues related to VLAN retagging. So with either PTP broken or pushing VLAN headers on egress for untagged packets being broken, the sad reality is that the best-effort VLAN filtering code is broken. Delete it. Note that this means there will be a temporary loss of functionality in this driver until it is replaced with something better (network stack RX/TX capability for "mode 2" as described in Documentation/networking/dsa/sja1105.rst, the "port under VLAN-aware bridge" case). We simply cannot keep this code until that driver rework is done, it is super bloated and tangled with tag_8021q. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/dsa/8021q.h | 9 +-------- include/linux/dsa/sja1105.h | 1 - 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 1587961f1a7b..608607f904a5 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -35,8 +35,6 @@ struct dsa_8021q_context { __be16 proto; }; -#define DSA_8021Q_N_SUBVLAN 8 - int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled); int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port, @@ -50,21 +48,16 @@ int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port, struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, - int *subvlan); +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id); u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port); u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port); -u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan); - int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); -u16 dsa_8021q_rx_subvlan(u16 vid); - bool vid_is_dsa_8021q_rxvlan(u16 vid); bool vid_is_dsa_8021q_txvlan(u16 vid); diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index b6089b88314c..0eadc7ac44ec 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -59,7 +59,6 @@ struct sja1105_skb_cb { ((struct sja1105_skb_cb *)((skb)->cb)) struct sja1105_port { - u16 subvlan_map[DSA_8021Q_N_SUBVLAN]; struct kthread_worker *xmit_worker; struct kthread_work xmit_work; struct sk_buff_head xmit_queue; -- cgit v1.2.3 From 8afbea187d31e4e9beb83b7a316d16b7879c2799 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 19 Jul 2021 20:14:45 +0300 Subject: net: dsa: tag_8021q: remove struct packet_type declaration This is no longer necessary since tag_8021q doesn't register itself as a full-blown tagger anymore. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 608607f904a5..5f01dea7d5b6 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -11,7 +11,6 @@ struct dsa_switch; struct sk_buff; struct net_device; -struct packet_type; struct dsa_8021q_context; struct dsa_8021q_crosschip_link { -- cgit v1.2.3 From cedf467064b6b8764fdb2ee6b9e3d18bc81a9d8f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 19 Jul 2021 20:14:46 +0300 Subject: net: dsa: tag_8021q: create dsa_tag_8021q_{register,unregister} helpers In preparation of moving tag_8021q to core DSA, move all initialization and teardown related to tag_8021q which is currently done by drivers in 2 functions called "register" and "unregister". These will gather more functionality in future patches, which will better justify the chosen naming scheme. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/dsa/8021q.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 5f01dea7d5b6..9945898a90c3 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -34,6 +34,12 @@ struct dsa_8021q_context { __be16 proto; }; +struct dsa_8021q_context *dsa_tag_8021q_register(struct dsa_switch *ds, + const struct dsa_8021q_ops *ops, + __be16 proto); + +void dsa_tag_8021q_unregister(struct dsa_8021q_context *ctx); + int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled); int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port, -- cgit v1.2.3 From d7b1fd520d5d4271f4ab9b1671afbdcd868039d3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 19 Jul 2021 20:14:48 +0300 Subject: net: dsa: let the core manage the tag_8021q context The basic problem description is as follows: Be there 3 switches in a daisy chain topology: | sw0p0 sw0p1 sw0p2 sw0p3 sw0p4 [ user ] [ user ] [ user ] [ dsa ] [ cpu ] | +---------+ | sw1p0 sw1p1 sw1p2 sw1p3 sw1p4 [ user ] [ user ] [ user ] [ dsa ] [ dsa ] | +---------+ | sw2p0 sw2p1 sw2p2 sw2p3 sw2p4 [ user ] [ user ] [ user ] [ user ] [ dsa ] The CPU will not be able to ping through the user ports of the bottom-most switch (like for example sw2p0), simply because tag_8021q was not coded up for this scenario - it has always assumed DSA switch trees with a single switch. To add support for the topology above, we must admit that the RX VLAN of sw2p0 must be added on some ports of switches 0 and 1 as well. This is in fact a textbook example of thing that can use the cross-chip notifier framework that DSA has set up in switch.c. There is only one problem: core DSA (switch.c) is not able right now to make the connection between a struct dsa_switch *ds and a struct dsa_8021q_context *ctx. Right now, it is drivers who call into tag_8021q.c and always provide a struct dsa_8021q_context *ctx pointer, and tag_8021q.c calls them back with the .tag_8021q_vlan_{add,del} methods. But with cross-chip notifiers, it is possible for tag_8021q to call drivers without drivers having ever asked for anything. A good example is right above: when sw2p0 wants to set itself up for tag_8021q, the .tag_8021q_vlan_add method needs to be called for switches 1 and 0, so that they transport sw2p0's VLANs towards the CPU without dropping them. So instead of letting drivers manage the tag_8021q context, add a tag_8021q_ctx pointer inside of struct dsa_switch, which will be populated when dsa_tag_8021q_register() returns success. The patch is fairly long-winded because we are partly reverting commit 5899ee367ab3 ("net: dsa: tag_8021q: add a context structure") which made the driver-facing tag_8021q API use "ctx" instead of "ds". Now that we can access "ctx" directly from "ds", this is no longer needed. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/dsa/8021q.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 9945898a90c3..77939c0c8dd5 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -34,20 +34,20 @@ struct dsa_8021q_context { __be16 proto; }; -struct dsa_8021q_context *dsa_tag_8021q_register(struct dsa_switch *ds, - const struct dsa_8021q_ops *ops, - __be16 proto); +int dsa_tag_8021q_register(struct dsa_switch *ds, + const struct dsa_8021q_ops *ops, + __be16 proto); -void dsa_tag_8021q_unregister(struct dsa_8021q_context *ctx); +void dsa_tag_8021q_unregister(struct dsa_switch *ds); -int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled); +int dsa_8021q_setup(struct dsa_switch *ds, bool enabled); -int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port, - struct dsa_8021q_context *other_ctx, +int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, int other_port); -int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port, - struct dsa_8021q_context *other_ctx, +int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, int other_port); struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, -- cgit v1.2.3 From 5da11eb407340233a6111c563419e19685a062a4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 19 Jul 2021 20:14:49 +0300 Subject: net: dsa: make tag_8021q operations part of the core Make tag_8021q a more central element of DSA and move the 2 driver specific operations outside of struct dsa_8021q_context (which is supposed to hold dynamic data and not really constant function pointers). Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 77939c0c8dd5..0bda08fb2f16 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -21,22 +21,14 @@ struct dsa_8021q_crosschip_link { refcount_t refcount; }; -struct dsa_8021q_ops { - int (*vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); - int (*vlan_del)(struct dsa_switch *ds, int port, u16 vid); -}; - struct dsa_8021q_context { - const struct dsa_8021q_ops *ops; struct dsa_switch *ds; struct list_head crosschip_links; /* EtherType of RX VID, used for filtering on master interface */ __be16 proto; }; -int dsa_tag_8021q_register(struct dsa_switch *ds, - const struct dsa_8021q_ops *ops, - __be16 proto); +int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); void dsa_tag_8021q_unregister(struct dsa_switch *ds); -- cgit v1.2.3 From 328621f6131f667c5c328bb72d45442fd76efb81 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 19 Jul 2021 20:14:50 +0300 Subject: net: dsa: tag_8021q: absorb dsa_8021q_setup into dsa_tag_8021q_{,un}register Right now, setting up tag_8021q is a 2-step operation for a driver, first the context structure needs to be created, then the VLANs need to be installed on the ports. A similar thing is true for teardown. Merge the 2 steps into the register/unregister methods, to be as transparent as possible for the driver as to what tag_8021q does behind the scenes. This also gets rid of the funny "bool setup == true means setup, == false means teardown" API that tag_8021q used to expose. 
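A hedged sketch of the driver-facing flow after this change (the driver callbacks are hypothetical; as the next paragraph notes, registration belongs in .setup() and teardown in .teardown()):

	static int foo_setup(struct dsa_switch *ds)
	{
		/* registration now also installs the tag_8021q VLANs */
		return dsa_tag_8021q_register(ds, htons(ETH_P_8021Q));
	}

	static void foo_teardown(struct dsa_switch *ds)
	{
		dsa_tag_8021q_unregister(ds);	/* removes the VLANs again */
	}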
Note that dsa_tag_8021q_register() must be called at least in the .setup() driver method and never earlier (like in the driver probe function). This is because the DSA switch tree is not initialized at probe time, and the cross-chip notifiers will not work. For symmetry with .setup(), the unregister method should be put in .teardown(). Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 0bda08fb2f16..9cf2c99eb668 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -32,8 +32,6 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); void dsa_tag_8021q_unregister(struct dsa_switch *ds); -int dsa_8021q_setup(struct dsa_switch *ds, bool enabled); - int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, int other_port); -- cgit v1.2.3 From c64b9c05045a21a5258f6dbd81d94a2a22ff73a2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 19 Jul 2021 20:14:52 +0300 Subject: net: dsa: tag_8021q: add proper cross-chip notifier support The big problem which mandates cross-chip notifiers for tag_8021q is this: | sw0p0 sw0p1 sw0p2 sw0p3 sw0p4 [ user ] [ user ] [ user ] [ dsa ] [ cpu ] | +---------+ | sw1p0 sw1p1 sw1p2 sw1p3 sw1p4 [ user ] [ user ] [ user ] [ dsa ] [ dsa ] | +---------+ | sw2p0 sw2p1 sw2p2 sw2p3 sw2p4 [ user ] [ user ] [ user ] [ dsa ] [ dsa ] When the user runs: ip link add br0 type bridge ip link set sw0p0 master br0 ip link set sw2p0 master br0 It doesn't work. This is because dsa_8021q_crosschip_bridge_join() assumes that "ds" and "other_ds" are at most 1 hop away from each other, so it is sufficient to add the RX VLAN of {ds, port} into {other_ds, other_port} and vice versa and presto, the cross-chip link works. When there is another switch in the middle, such as in this case switch 1 with its DSA links sw1p3 and sw1p4, somebody needs to tell it about these VLANs too. Which is exactly why the problem is quadratic: when a port joins a bridge, for each port in the tree that's already in that same bridge we notify a tag_8021q VLAN addition of that port's RX VLAN to the entire tree. It is a very complicated web of VLANs. It must be mentioned that currently we install tag_8021q VLANs on too many ports (DSA links - to be precise, on all of them). For example, when sw2p0 joins br0, and assuming sw1p0 was part of br0 too, we add the RX VLAN of sw2p0 on the DSA links of switch 0 too, even though there isn't any port of switch 0 that is a member of br0 (at least yet). In theory we could notify only the switches which sit in between the port joining the bridge and the port reacting to that bridge_join event. But in practice that is impossible, because of the way 'link' properties are described in the device tree. The DSA bindings require DT writers to list out not only the real/physical DSA links, but in fact the entire routing table, like for example switch 0 above will have: sw0p3: port@3 { link = <&sw1p4 &sw2p4>; }; This was done because: /* TODO: ideally DSA ports would have a single dp->link_dp member, * and no dst->rtable nor this struct dsa_link would be needed, * but this would require some more complex tree walking, * so keep it stupid at the moment and list them all. 
*/ but it is a perfect example of a situation where too much information is actively detrimental, because we are now in the position where we cannot distinguish a real DSA link from one that is put there to avoid the 'complex tree walking'. And because DT is ABI, there is not much we can change. And because we do not know which DSA links are real and which ones aren't, we can't really know if DSA switch A is in the data path between switches B and C, in the general case. So this is why tag_8021q RX VLANs are added on all DSA links, and probably why it will never change. On the other hand, at least the number of additions/deletions is well balanced, and this means that once we implement reference counting at the cross-chip notifier level a la fdb/mdb, there is absolutely zero need for a struct dsa_8021q_crosschip_link; it's all self-managing. In fact, with the tag_8021q notifiers emitted from the bridge join notifiers, it becomes so generic that sja1105 does not need to do anything anymore: we can just delete its implementation of the .crosschip_bridge_{join,leave} methods. Among the things we can simply delete is the home-grown implementation of sja1105_notify_crosschip_switches(). The reason why that is wrong is that it is not quadratic - it only covers remote switches to which we have a cross-chip bridging link and that does not cover in-between switches. This deletion is part of the same patch because sja1105 used to poke deep inside the guts of the tag_8021q context in order to do that. Because the cross-chip links went away, so must the sja1105 code. Last but not least, dsa_8021q_setup_port() is simplified (and also renamed). Because our TAG_8021Q_VLAN_ADD notifier is designed to react on the CPU port too, the four dsa_8021q_vid_apply() calls: - 1 for RX VLAN on user port - 1 for the user port's RX VLAN on the CPU port - 1 for TX VLAN on user port - 1 for the user port's TX VLAN on the CPU port now get squashed into only 2 notifier calls via dsa_port_tag_8021q_vlan_add. And because the notifiers to add and to delete a tag_8021q VLAN are distinct, now we finally break up the port setup and teardown into separate functions instead of relying on a "bool enabled" flag which tells us what to do. Arguably it should have been this way from the get-go. Signed-off-by: Vladimir Oltean Signed-off-by: David S.
Miller --- include/linux/dsa/8021q.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 9cf2c99eb668..ec5abfcdefd1 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -11,19 +11,17 @@ struct dsa_switch; struct sk_buff; struct net_device; -struct dsa_8021q_context; -struct dsa_8021q_crosschip_link { +struct dsa_tag_8021q_vlan { struct list_head list; int port; - struct dsa_8021q_context *other_ctx; - int other_port; + u16 vid; refcount_t refcount; }; struct dsa_8021q_context { struct dsa_switch *ds; - struct list_head crosschip_links; + struct list_head vlans; /* EtherType of RX VID, used for filtering on master interface */ __be16 proto; }; @@ -32,14 +30,6 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); void dsa_tag_8021q_unregister(struct dsa_switch *ds); -int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port); - -int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port); - struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); -- cgit v1.2.3 From 8b72b301b442907742c1af1b8fcb52e351a2aac1 Mon Sep 17 00:00:00 2001 From: Xu Liang Date: Mon, 19 Jul 2021 13:32:11 +0800 Subject: net: phy: add API to read 802.3-c45 IDs Add API to read 802.3-c45 IDs so that C22/C45 mixed device can use C45 APIs without failing ID checks. Signed-off-by: Xu Liang Acked-by: Hauke Mehrtens Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 3b80dc3ed68b..736e1d1a47c4 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1431,6 +1431,7 @@ static inline int phy_device_register(struct phy_device *phy) static inline void phy_device_free(struct phy_device *phydev) { } #endif /* CONFIG_PHYLIB */ void phy_device_remove(struct phy_device *phydev); +int phy_get_c45_ids(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); -- cgit v1.2.3 From 9ee11f0fff205b4b3df9750bff5e94f97c71b6a0 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 20 Jul 2021 21:42:57 +0200 Subject: ipv6: ioam: Data plane support for Pre-allocated Trace Implement support for processing the IOAM Pre-allocated Trace with IPv6, see [1] and [2]. Introduce a new IPv6 Hop-by-Hop TLV option, see IANA [3]. A new per-interface sysctl is introduced. The value is a boolean to accept (=1) or ignore (=0, by default) IPv6 IOAM options on ingress for an interface: - net.ipv6.conf.XXX.ioam6_enabled Two other sysctls are introduced to define IOAM IDs, represented by an integer. They are respectively per-namespace and per-interface: - net.ipv6.ioam6_id - net.ipv6.conf.XXX.ioam6_id The value of the first one represents the IOAM ID of the node itself (u32; max and default value = U32_MAX>>8, due to hop limit concatenation) while the other represents the IOAM ID of an interface (u16; max and default value = U16_MAX). 
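To see where the >>8 ceiling comes from, here is a sketch of the encoding the limits imply (an illustration based on the description above, not code from this patch): the trace data field packs the hop limit and the node ID into a single 32-bit word, leaving only 24 bits for the short node ID:

#include <linux/types.h>

/* trace type bit 0: hop_limit (8 bits) concatenated with the node id
 * (24 bits), hence the U32_MAX >> 8 ceiling on net.ipv6.ioam6_id */
static u32 ioam6_hop_limit_node_id(u8 hop_limit, u32 node_id)
{
	return ((u32)hop_limit << 24) | (node_id & 0x00ffffff);
}

The "_wide" equivalents described next follow the same pattern on a 64-bit field, which is why their ceiling is U64_MAX >> 8.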
Each "ioam6_id" sysctl has a "_wide" equivalent: - net.ipv6.ioam6_id_wide - net.ipv6.conf.XXX.ioam6_id_wide The value of the first one represents the wide IOAM ID of the node itself (u64; max and default value = U64_MAX>>8, due to hop limit concatenation) while the other represents the wide IOAM ID of an interface (u32; max and default value = U32_MAX). The use of short and wide equivalents is not exclusive, a deployment could choose to leverage both. For example, net.ipv6.conf.XXX.ioam6_id (short format) could be an identifier for a physical interface, whereas net.ipv6.conf.XXX.ioam6_id_wide (wide format) could be an identifier for a logical sub-interface. Documentation about new sysctls is provided at the end of this patchset. Two relativistic hash tables are used: one for IOAM namespaces, the other for IOAM schemas. A namespace can only have a single active schema and a schema can only be attached to a single namespace (1:1 relationship). [1] https://tools.ietf.org/html/draft-ietf-ippm-ioam-ipv6-options [2] https://tools.ietf.org/html/draft-ietf-ippm-ioam-data [3] https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml#ipv6-parameters-2 Signed-off-by: Justin Iurman Signed-off-by: David S. Miller --- include/linux/ioam6.h | 13 +++++++++++++ include/linux/ipv6.h | 3 +++ 2 files changed, 16 insertions(+) create mode 100644 include/linux/ioam6.h (limited to 'include/linux') diff --git a/include/linux/ioam6.h b/include/linux/ioam6.h new file mode 100644 index 000000000000..94a24b36998f --- /dev/null +++ b/include/linux/ioam6.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * IPv6 IOAM + * + * Author: + * Justin Iurman + */ +#ifndef _LINUX_IOAM6_H +#define _LINUX_IOAM6_H + +#include + +#endif /* _LINUX_IOAM6_H */ diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 70b2ad3b9884..ef4a69865737 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -76,6 +76,9 @@ struct ipv6_devconf { __s32 disable_policy; __s32 ndisc_tclass; __s32 rpl_seg_enabled; + __u32 ioam6_id; + __u32 ioam6_id_wide; + __u8 ioam6_enabled; struct ctl_table_header *sysctl_header; }; -- cgit v1.2.3 From 8c6f6fa6772696be0c047a711858084b38763728 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 20 Jul 2021 21:42:58 +0200 Subject: ipv6: ioam: IOAM Generic Netlink API Add Generic Netlink commands to allow userspace to configure IOAM namespaces and schemas. The target is iproute2 and the patch is ready. It will be posted as soon as this patchset is merged. Here is an overview: $ ip ioam Usage: ip ioam { COMMAND | help } ip ioam namespace show ip ioam namespace add ID [ data DATA32 ] [ wide DATA64 ] ip ioam namespace del ID ip ioam schema show ip ioam schema add ID DATA ip ioam schema del ID ip ioam namespace set ID schema { ID | none } Signed-off-by: Justin Iurman Signed-off-by: David S. 
Miller --- include/linux/ioam6_genl.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 include/linux/ioam6_genl.h (limited to 'include/linux') diff --git a/include/linux/ioam6_genl.h b/include/linux/ioam6_genl.h new file mode 100644 index 000000000000..176e67919de3 --- /dev/null +++ b/include/linux/ioam6_genl.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * IPv6 IOAM Generic Netlink API + * + * Author: + * Justin Iurman + */ +#ifndef _LINUX_IOAM6_GENL_H +#define _LINUX_IOAM6_GENL_H + +#include + +#endif /* _LINUX_IOAM6_GENL_H */ -- cgit v1.2.3 From 3edede08ff37c6a9370510508d5eeb54890baf47 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 20 Jul 2021 21:42:59 +0200 Subject: ipv6: ioam: Support for IOAM injection with lwtunnels Add support for the IOAM inline insertion (only for the host-to-host use case) which is per-route configured with lightweight tunnels. The target is iproute2 and the patch is ready. It will be posted as soon as this patchset is merged. Here is an overview: $ ip -6 ro ad fc00::1/128 encap ioam6 trace type 0x800000 ns 1 size 12 dev eth0 This example configures an IOAM Pre-allocated Trace option attached to the fc00::1/128 prefix. The IOAM namespace (ns) is 1, the size of the pre-allocated trace data block is 12 octets (size) and only the first IOAM data (bit 0: hop_limit + node id) is included in the trace (type) represented as a bitfield. The reason why the in-transit (IPv6-in-IPv6 encapsulation) use case is not implemented is explained in the patchset cover. Signed-off-by: Justin Iurman Signed-off-by: David S. Miller --- include/linux/ioam6_iptunnel.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 include/linux/ioam6_iptunnel.h (limited to 'include/linux') diff --git a/include/linux/ioam6_iptunnel.h b/include/linux/ioam6_iptunnel.h new file mode 100644 index 000000000000..07d9dfedd29d --- /dev/null +++ b/include/linux/ioam6_iptunnel.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * IPv6 IOAM Lightweight Tunnel API + * + * Author: + * Justin Iurman + */ +#ifndef _LINUX_IOAM6_IPTUNNEL_H +#define _LINUX_IOAM6_IPTUNNEL_H + +#include + +#endif /* _LINUX_IOAM6_IPTUNNEL_H */ -- cgit v1.2.3 From 2f5dc00f7a3ea669fd387ce79ffca92bff361550 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 21 Jul 2021 19:24:01 +0300 Subject: net: bridge: switchdev: let drivers inform which bridge ports are offloaded On reception of an skb, the bridge checks if it was marked as 'already forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it is, it assigns the source hardware domain of that skb based on the hardware domain of the ingress port. Then during forwarding, it enforces that the egress port must have a different hardware domain than the ingress one (this is done in nbp_switchdev_allowed_egress). Non-switchdev drivers don't report any physical switch id (neither through devlink nor .ndo_get_port_parent_id), therefore the bridge assigns them a hardware domain of 0, and packets coming from them will always have skb->offload_fwd_mark = 0. So there aren't any restrictions. Problems appear due to the fact that DSA would like to perform software fallback for bonding and team interfaces that the physical switch cannot offload. +-- br0 ---+ / / | \ / / | \ / | | bond0 / | | / \ swp0 swp1 swp2 swp3 swp4 There, it is desirable that the presence of swp3 and swp4 under a non-offloaded LAG does not preclude us from doing hardware bridging between swp0, swp1 and swp2.
The bandwidth of the CPU is often times high enough that software bridging between {swp0,swp1,swp2} and bond0 is not impractical. But this creates an impossible paradox given the current way in which port hardware domains are assigned. When the driver receives a packet from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to something. - If we set it to 0, then the bridge will forward it towards swp1, swp2 and bond0. But the switch has already forwarded it towards swp1 and swp2 (not to bond0, remember, that isn't offloaded, so as far as the switch is concerned, ports swp3 and swp4 are not looking up the FDB, and the entire bond0 is a destination that is strictly behind the CPU). But we don't want duplicated traffic towards swp1 and swp2, so it's not ok to set skb->offload_fwd_mark = 0. - If we set it to 1, then the bridge will not forward the skb towards the ports with the same switchdev mark, i.e. not to swp1, swp2 and bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should have forwarded the skb there. So the real issue is that bond0 will be assigned the same hardware domain as {swp0,swp1,swp2}, because the function that assigns hardware domains to bridge ports, nbp_switchdev_add(), recurses through bond0's lower interfaces until it finds something that implements devlink (calls dev_get_port_parent_id with bool recurse = true). This is a problem because the fact that bond0 can be offloaded by swp3 and swp4 in our example is merely an assumption. A solution is to give the bridge explicit hints as to what hardware domain it should use for each port. Currently, the bridging offload is very 'silent': a driver registers a netdevice notifier, which is put on the netns's notifier chain, and which sniffs around for NETDEV_CHANGEUPPER events where the upper is a bridge, and the lower is an interface it knows about (one registered by this driver, normally). Then, from within that notifier, it does a bunch of stuff behind the bridge's back, without the bridge necessarily knowing that there's somebody offloading that port. It looks like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v call_netdevice_notifiers | v dsa_slave_netdevice_event | v oh, hey! it's for me! | v .port_bridge_join What we do to solve the conundrum is to be less silent, and change the switchdev drivers to present themselves to the bridge. Something like this: ip link set swp0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | hardware domain for v | this port, and zero dsa_slave_netdevice_event | if I got nothing. | | v | oh, hey! it's for me! | | | v | .port_bridge_join | | | +------------------------+ switchdev_bridge_port_offload(swp0, swp0) Then stacked interfaces (like bond0 on top of swp3/swp4) would be treated differently in DSA, depending on whether we can or cannot offload them. The offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge: Aye! I'll use this call_netdevice_notifiers ^ ppid as the | | switchdev mark for v | bond0. dsa_slave_netdevice_event | Coincidentally (or not), | | bond0 and swp0, swp1, swp2 v | all have the same switchdev hmm, it's not quite for me, | mark now, since the ASIC but my driver has already | is able to forward towards called .port_lag_join | all these ports in hw. for it, because I have | a port with dp->lag_dev == bond0. 
| | | v | .port_bridge_join | for swp3 and swp4 | | | +------------------------+ switchdev_bridge_port_offload(bond0, swp3) switchdev_bridge_port_offload(bond0, swp4) And the non-offload case: ip link set bond0 master br0 | v br_add_if() calls netdev_master_upper_dev_link() | v bridge waiting: call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload | | wasn't called, okay, I'll use a v | hwdom of zero for this one. dsa_slave_netdevice_event : Then packets received on swp0 will | : not be software-forwarded towards v : swp1, but they will towards bond0. it's not for me, but bond0 is an upper of swp3 and swp4, but their dp->lag_dev is NULL because they couldn't offload it. Basically we can draw the conclusion that the lowers of a bridge port can come and go, so depending on the configuration of lowers for a bridge port, it can dynamically toggle between offloaded and unoffloaded. Therefore, we need an equivalent switchdev_bridge_port_unoffload too. This patch changes the way any switchdev driver interacts with the bridge. From now on, everybody needs to call switchdev_bridge_port_offload and switchdev_bridge_port_unoffload, otherwise the bridge will treat the port as non-offloaded and allow software flooding to other ports from the same ASIC. Note that these functions lay the ground for a more complex handshake between switchdev drivers and the bridge in the future. For drivers that will request a replay of the switchdev objects when they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we place the call to switchdev_bridge_port_unoffload() strategically inside the NETDEV_PRECHANGEUPPER notifier's code path, and not inside NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers need the netdev adjacency lists to be valid, and that is only true in NETDEV_PRECHANGEUPPER. Cc: Vadym Kochan Cc: Taras Chornyi Cc: Ioana Ciornei Cc: Lars Povlsen Cc: Steen Hegelund Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil Cc: Alexandre Belloni Cc: Grygorii Strashko Signed-off-by: Vladimir Oltean Tested-by: Ioana Ciornei # dpaa2-switch: regression Acked-by: Ioana Ciornei # dpaa2-switch Tested-by: Horatiu Vultur # ocelot-switch Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index b651c5e32a28..ce413eca527e 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -206,4 +206,25 @@ static inline int br_fdb_replay(const struct net_device *br_dev, } #endif +#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_NET_SWITCHDEV) + +int switchdev_bridge_port_offload(struct net_device *brport_dev, + struct net_device *dev, + struct netlink_ext_ack *extack); +void switchdev_bridge_port_unoffload(struct net_device *brport_dev); + +#else + +static inline int switchdev_bridge_port_offload(struct net_device *brport_dev, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + return -EINVAL; +} + +static inline void switchdev_bridge_port_unoffload(struct net_device *brport_dev) +{ +} +#endif + #endif -- cgit v1.2.3 From 4e51bf44a03af6fa19a39a36ea8fedfacb8ccadf Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 21 Jul 2021 19:24:03 +0300 Subject: net: bridge: move the switchdev object replay helpers to "push" mode Starting with commit 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries"), DSA has introduced some bridge helpers that replay switchdev events (FDB/MDB/VLAN additions and deletions) that can be lost by the switchdev drivers in a variety of circumstances: - an IP multicast group was host-joined on the bridge itself before any switchdev port joined the bridge, leading to the host MDB entries missing in the hardware database. - during the bridge creation process, the MAC address of the bridge was added to the FDB as an entry pointing towards the bridge device itself, but with no switchdev ports being part of the bridge yet, this local FDB entry would remain unknown to the switchdev hardware database. - a VLAN/FDB/MDB was added to a bridge port that is a LAG interface, before any switchdev port joined that LAG, leading to the hardware database missing those entries. - a switchdev port left a LAG that is a bridge port, while the LAG remained part of the bridge, and all FDB/MDB/VLAN entries remained installed in the hardware database of the switchdev port. Also, since commit 0d2cfbd41c4a ("net: bridge: ignore switchdev events for LAG ports which didn't request replay"), DSA introduced a method, based on a const void *ctx, to ensure that two switchdev ports under the same LAG that is a bridge port do not see the same MDB/VLAN entry being replayed twice by the bridge, once for every bridge port that joins the LAG. With so many ordering corner cases being possible, it seems unreasonable to expect a switchdev driver writer to get it right from the first try. Therefore, now that DSA has experimented with the bridge replay helpers for a little bit, we can move the code to the bridge driver where it is more readily available to all switchdev drivers. To convert the switchdev object replay helpers from "pull mode" (where the driver asks for them) to a "push mode" (where the bridge offers them automatically), the biggest problem is that the bridge needs to be aware when a switchdev port joins and leaves, even when the switchdev is only indirectly a bridge port (for example when the bridge port is a LAG upper of the switchdev). Luckily, we already have a hook for that, in the form of the newly introduced switchdev_bridge_port_offload() and switchdev_bridge_port_unoffload() calls. 
These offer a natural place for hooking the object addition and deletion replays. Extend the above 2 functions with: - pointers to the switchdev atomic notifier (for FDB replays) and the blocking notifier (for MDB and VLAN replays). - the "const void *ctx" argument required for drivers to be able to disambiguate which port is targeted, when multiple ports are lowers of the same LAG that is a bridge port. Most of the drivers pass NULL to this argument, except the ones that support LAG offload and have the proper context check already in place in the switchdev blocking notifier handler. Also unexport the replay helpers, since nobody except the bridge calls them directly now. Note that: (a) we abuse the terminology slightly, because FDB entries are not "switchdev objects", but we count them as objects nonetheless. With no direct way to prove it, I think they are not modeled as switchdev objects because those can only be installed by the bridge to the hardware (as opposed to FDB entries which can be propagated in the other direction too). This is merely an abuse of terms: FDB entries are replayed too, despite not being objects. (b) the bridge does not attempt to sync port attributes to newly joined ports, just the countable stuff (the objects). The reason for this is simple: no universal and symmetric way to sync and unsync them is known. For example, VLAN filtering: what to do on unsync, disable or leave it enabled? Similarly, STP state, ageing timer, etc. What a switchdev port does when it becomes standalone again is not really within the bridge's competence, and the driver should deal with it. On the other hand, replaying deletions of switchdev objects can be seen as a matter of cleanup and can therefore be handled by the bridge, hence this patch. We make the replay helpers opt-in for drivers, because they might not bring immediate benefits for them: - nbp_vlan_init() is called _after_ netdev_master_upper_dev_link(), so br_vlan_replay() should not do anything for the new drivers on which we call it. The existing drivers where there was even a slight possibility for there to exist a VLAN on a bridge port before they join it are already guarded against this: mlxsw and prestera deny joining LAG interfaces that are members of a bridge. - br_fdb_replay() should now notify of local FDB entries, but I patched all drivers except DSA to ignore these new entries in commit 2c4eca3ef716 ("net: bridge: switchdev: include local flag in FDB notifications"). Driver authors can lift this restriction as they wish, and when they do, they can also opt into the FDB replay functionality. - br_mdb_replay() should fix a real issue which is described in commit 4f2673b3a2b6 ("net: bridge: add helper to replay port and host-joined mdb entries"). However, most drivers do not offload SWITCHDEV_OBJ_ID_HOST_MDB and so never see this issue: only cpsw and am65_cpsw offload this switchdev object, and I don't completely understand the way in which they offload this switchdev object anyway. So I'll leave it up to these drivers' respective maintainers to opt into br_mdb_replay().
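As a rough sketch, a driver that opts into the replays would now make a call along these lines (the foo_* names are hypothetical; the signature is the one this patch introduces):

#include <linux/if_bridge.h>
#include <linux/netdevice.h>

struct foo_port {
	struct net_device *dev;	/* hypothetical per-port driver state */
};

static struct notifier_block foo_switchdev_nb;		/* FDB events */
static struct notifier_block foo_switchdev_blocking_nb;	/* MDB/VLAN events */

static int foo_port_bridge_join(struct foo_port *port,
				struct net_device *brport_dev,
				struct netlink_ext_ack *extack)
{
	/* real notifier blocks opt in to the FDB/MDB/VLAN replays;
	 * drivers that do not want them pass NULL instead */
	return switchdev_bridge_port_offload(brport_dev, port->dev, port,
					     &foo_switchdev_nb,
					     &foo_switchdev_blocking_nb,
					     extack);
}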
So most of the drivers pass NULL notifier blocks for the replay helpers, except: - dpaa2-switch which was already acked/regression-tested with the helpers enabled (and there isn't much of a downside in having them) - ocelot which already had replay logic in "pull" mode - DSA which already had replay logic in "pull" mode An important observation is that the drivers which don't currently request bridge event replays don't even have the switchdev_bridge_port_{offload,unoffload} calls placed in proper places right now. This was done to avoid unnecessary rework for drivers which might never even add support for this. For driver writers who wish to add replay support, this can be used as a tentative placement guide: https://patchwork.kernel.org/project/netdevbpf/patch/20210720134655.892334-11-vladimir.oltean@nxp.com/ Cc: Vadym Kochan Cc: Taras Chornyi Cc: Ioana Ciornei Cc: Lars Povlsen Cc: Steen Hegelund Cc: UNGLinuxDriver@microchip.com Cc: Claudiu Manoil Cc: Alexandre Belloni Cc: Grygorii Strashko Signed-off-by: Vladimir Oltean Acked-by: Ioana Ciornei # dpaa2-switch Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 54 ++++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index ce413eca527e..bbf680093823 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -70,9 +70,6 @@ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto); bool br_multicast_has_router_adjacent(struct net_device *dev, int proto); bool br_multicast_enabled(const struct net_device *dev); bool br_multicast_router(const struct net_device *dev); -int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, - const void *ctx, bool adding, struct notifier_block *nb, - struct netlink_ext_ack *extack); #else static inline int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list) @@ -104,13 +101,6 @@ static inline bool br_multicast_router(const struct net_device *dev) { return false; } -static inline int br_mdb_replay(const struct net_device *br_dev, - const struct net_device *dev, const void *ctx, - bool adding, struct notifier_block *nb, - struct netlink_ext_ack *extack) -{ - return -EOPNOTSUPP; -} #endif #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) @@ -120,9 +110,6 @@ int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid); int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto); int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); -int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, - const void *ctx, bool adding, struct notifier_block *nb, - struct netlink_ext_ack *extack); #else static inline bool br_vlan_enabled(const struct net_device *dev) { @@ -149,14 +136,6 @@ static inline int br_vlan_get_info(const struct net_device *dev, u16 vid, { return -EINVAL; } - -static inline int br_vlan_replay(struct net_device *br_dev, - struct net_device *dev, const void *ctx, - bool adding, struct notifier_block *nb, - struct netlink_ext_ack *extack) -{ - return -EOPNOTSUPP; -} #endif #if IS_ENABLED(CONFIG_BRIDGE) @@ -167,8 +146,6 @@ void br_fdb_clear_offload(const struct net_device *dev, u16 vid); bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag); u8 br_port_get_stp_state(const struct net_device *dev); clock_t br_get_ageing_time(const struct net_device *br_dev); -int br_fdb_replay(const 
struct net_device *br_dev, const struct net_device *dev, - const void *ctx, bool adding, struct notifier_block *nb); #else static inline struct net_device * br_fdb_find_port(const struct net_device *br_dev, @@ -197,32 +174,37 @@ static inline clock_t br_get_ageing_time(const struct net_device *br_dev) { return 0; } - -static inline int br_fdb_replay(const struct net_device *br_dev, - const struct net_device *dev, const void *ctx, - bool adding, struct notifier_block *nb) -{ - return -EOPNOTSUPP; -} #endif #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_NET_SWITCHDEV) int switchdev_bridge_port_offload(struct net_device *brport_dev, - struct net_device *dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, struct netlink_ext_ack *extack); -void switchdev_bridge_port_unoffload(struct net_device *brport_dev); +void switchdev_bridge_port_unoffload(struct net_device *brport_dev, + const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb); #else -static inline int switchdev_bridge_port_offload(struct net_device *brport_dev, - struct net_device *dev, - struct netlink_ext_ack *extack) +static inline int +switchdev_bridge_port_offload(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) { return -EINVAL; } -static inline void switchdev_bridge_port_unoffload(struct net_device *brport_dev) +static inline void +switchdev_bridge_port_unoffload(struct net_device *brport_dev, + const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) { } #endif -- cgit v1.2.3 From 1a33b18b3bd9748c9c712a23e788bf1f1c4a7025 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Jul 2021 16:28:58 +0200 Subject: compat: make linux/compat.h available everywhere Parts of linux/compat.h are under an #ifdef, but we end up using more of those over time, moving things around bit by bit. To get it over with once and for all, make all of this file unconditional now so it can be accessed everywhere. There are only a few types left that are in asm/compat.h but not yet in the asm-generic version, so add those in the process. This requires providing a few more types in asm-generic/compat.h that were not already there. The only tricky one is compat_sigset_t, which needs a little help on 32-bit architectures and for x86. Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Signed-off-by: David S.
Miller --- include/linux/compat.h | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index c270124e4402..8e0598c7d1d1 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -20,11 +20,8 @@ #include #include - -#ifdef CONFIG_COMPAT #include #include -#endif #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER /* @@ -95,8 +92,6 @@ struct compat_iovec { compat_size_t iov_len; }; -#ifdef CONFIG_COMPAT - #ifndef compat_user_stack_pointer #define compat_user_stack_pointer() current_user_stack_pointer() #endif @@ -131,9 +126,11 @@ struct compat_tms { #define _COMPAT_NSIG_WORDS (_COMPAT_NSIG / _COMPAT_NSIG_BPW) +#ifndef compat_sigset_t typedef struct { compat_sigset_word sig[_COMPAT_NSIG_WORDS]; } compat_sigset_t; +#endif int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize); @@ -384,6 +381,7 @@ struct compat_keyctl_kdf_params { __u32 __spare[8]; }; +struct compat_stat; struct compat_statfs; struct compat_statfs64; struct compat_old_linux_dirent; @@ -428,7 +426,7 @@ put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, unsigned int size) { /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ -#ifdef __BIG_ENDIAN +#if defined(__BIG_ENDIAN) && defined(CONFIG_64BIT) compat_sigset_t v; switch (_NSIG_WORDS) { case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; @@ -929,17 +927,6 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args); #endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ - -/* - * For most but not all architectures, "am I in a compat syscall?" and - * "am I a compat task?" are the same question. For architectures on which - * they aren't the same question, arch code can override in_compat_syscall. - */ - -#ifndef in_compat_syscall -static inline bool in_compat_syscall(void) { return is_compat_task(); } -#endif - /** * ns_to_old_timeval32 - Compat version of ns_to_timeval * @nsec: the nanoseconds value to be converted @@ -969,6 +956,17 @@ int kcompat_sys_statfs64(const char __user * pathname, compat_size_t sz, int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user * buf); +#ifdef CONFIG_COMPAT + +/* + * For most but not all architectures, "am I in a compat syscall?" and + * "am I a compat task?" are the same question. For architectures on which + * they aren't the same question, arch code can override in_compat_syscall. + */ +#ifndef in_compat_syscall +static inline bool in_compat_syscall(void) { return is_compat_task(); } +#endif + #else /* !CONFIG_COMPAT */ #define is_compat_task() (0) -- cgit v1.2.3 From dd98d2895de6485c884a9cb42de69fed02826fa4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Jul 2021 16:28:59 +0200 Subject: ethtool: improve compat ioctl handling The ethtool compat ioctl handling is hidden away in net/socket.c, which introduces a couple of minor oddities: - The implementation may end up diverging, as seen in the RXNFC extension in commit 84a1d9c48200 ("net: ethtool: extend RXNFC API to support RSS spreading of filter matches") that does not work in compat mode. - Most architectures do not need the compat handling at all because u64 and compat_u64 have the same alignment. - On x86, the conversion is done for both x32 and i386 user space, but it's actually wrong to do it for x32 and cannot work there. - On 32-bit Arm, it never worked for compat oabi user space, since that needs to do the same conversion but does not. 
- It would be nice to get rid of both compat_alloc_user_space() and copy_in_user() throughout the kernel. None of these actually seems to be a serious problem that real users are likely to encounter, but fixing all of them actually leads to code that is both shorter and more readable. Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/ethtool.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 232daaec56e4..4711b96dae0c 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -17,8 +17,6 @@ #include #include -#ifdef CONFIG_COMPAT - struct compat_ethtool_rx_flow_spec { u32 flow_type; union ethtool_flow_union h_u; @@ -38,8 +36,6 @@ struct compat_ethtool_rxnfc { u32 rule_locs[]; }; -#endif /* CONFIG_COMPAT */ - #include /** -- cgit v1.2.3 From b0e99d03778b2418aec20db99d97d19d25d198b6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Jul 2021 16:29:01 +0200 Subject: net: socket: remove register_gifconf Since dynamic registration of the gifconf() helper is only used for IPv4, and this can not be in a loadable module, this can be simplified noticeably by turning it into a direct function call as a preparation for cleaning up the compat handling. Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 9 +++++++++ include/linux/netdevice.h | 8 -------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 53aa0343bf69..67e042932681 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -178,6 +178,15 @@ static inline struct net_device *ip_dev_find(struct net *net, __be32 addr) int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b); int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *); +#ifdef CONFIG_INET +int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size); +#else +static inline int inet_gifconf(struct net_device *dev, char __user *buf, + int len, int size) +{ + return 0; +} +#endif void devinet_init(void); struct in_device *inetdev_by_index(struct net *, int); __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 42f6f866d5f3..6630a9f0b0f0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3289,14 +3289,6 @@ static inline bool dev_has_header(const struct net_device *dev) return dev->header_ops && dev->header_ops->create; } -typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, - int len, int size); -int register_gifconf(unsigned int family, gifconf_func_t *gifconf); -static inline int unregister_gifconf(unsigned int family) -{ - return register_gifconf(family, NULL); -} - #ifdef CONFIG_NET_FLOW_LIMIT #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { -- cgit v1.2.3 From 876f0bf9d0d5189dca9341c8e8e8686b09db8398 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Jul 2021 16:29:02 +0200 Subject: net: socket: simplify dev_ifconf handling The dev_ifconf() calling conventions make compat handling more complicated than necessary, simplify this by moving the in_compat_syscall() check into the function. Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6630a9f0b0f0..da2c273c7e0a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4008,7 +4008,7 @@ void netdev_rx_handler_unregister(struct net_device *dev); bool dev_valid_name(const char *name); int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout); -int dev_ifconf(struct net *net, struct ifconf *, int); +int dev_ifconf(struct net *net, struct ifconf __user *ifc); int dev_ethtool(struct net *net, struct ifreq *); unsigned int dev_get_flags(const struct net_device *); int __dev_change_flags(struct net_device *dev, unsigned int flags, -- cgit v1.2.3 From 29c4964822aad42c960d9edf67fb8209f1886baa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Jul 2021 16:29:03 +0200 Subject: net: socket: rework compat_ifreq_ioctl() compat_ifreq_ioctl() is one of the last users of copy_in_user() and compat_alloc_user_space(), as it attempts to convert the 'struct ifreq' arguments from 32-bit to 64-bit format as used by dev_ioctl() and a couple of socket family specific interpretations. The current implementation works correctly when calling dev_ioctl(), inet_ioctl(), ieee802154_sock_ioctl(), atalk_ioctl(), qrtr_ioctl() and packet_ioctl(). The ioctl handlers for ax25, netrom, rose and x25 do not interpret the arguments and only block the corresponding commands, so they do not care. For af_inet6 and af_decnet however, the compat conversion is slightly incorrect, as it will copy more data than the native handler accesses; both of them use a structure that is shorter than ifreq. Replace the copy_in_user() conversion with a pair of accessor functions to read and write the ifreq data in place with the correct length where needed, while leaving the other ones to copy the (already compatible) structures directly. Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index da2c273c7e0a..c871dc223dfa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4006,6 +4006,8 @@ int netdev_rx_handler_register(struct net_device *dev, void netdev_rx_handler_unregister(struct net_device *dev); bool dev_valid_name(const char *name); +int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg); +int put_user_ifreq(struct ifreq *ifr, void __user *arg); int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout); int dev_ifconf(struct net *net, struct ifconf __user *ifc); -- cgit v1.2.3 From 472111920f1c5fbe103022a4b05bfb37128a2a29 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Thu, 22 Jul 2021 18:55:38 +0300 Subject: net: bridge: switchdev: allow the TX data plane forwarding to be offloaded Allow switchdevs to forward frames from the CPU in accordance with the bridge configuration in the same way as is done between bridge ports. This means that the bridge will only send a single skb towards one of the ports under the switchdev's control, and expects the driver to deliver the packet to all eligible ports in its domain. Primarily, this improves the performance of multicast flows with multiple subscribers, as it allows the hardware to perform the frame replication.
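On the driver side, the idea could be sketched like this (the foo_* names and helpers are hypothetical illustrations; the actual handshake is spelled out below):

#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct foo_switch;
struct foo_port {
	struct foo_switch *sw;	/* hypothetical driver state */
};

static netdev_tx_t foo_xmit_by_fdb_lookup(struct foo_switch *sw,
					  struct sk_buff *skb);
static netdev_tx_t foo_xmit_to_port(struct foo_port *port,
				    struct sk_buff *skb);

static netdev_tx_t foo_port_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct foo_port *port = netdev_priv(dev);

	if (skb->offload_fwd_mark)
		/* single copy from the bridge: let the hardware FDB
		 * lookup pick the destination port mask and replicate */
		return foo_xmit_by_fdb_lookup(port->sw, skb);

	/* otherwise inject precisely through this one port */
	return foo_xmit_to_port(port, skb);
}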
The basic flow between the driver and the bridge is as follows: - When joining a bridge port, the switchdev driver calls switchdev_bridge_port_offload() with tx_fwd_offload = true. - The bridge sends offloadable skbs to one of the ports under the switchdev's control using skb->offload_fwd_mark = true. - The switchdev driver checks the skb->offload_fwd_mark field and lets its FDB lookup select the destination port mask for this packet. v1->v2: - convert br_input_skb_cb::fwd_hwdoms to a plain unsigned long - introduce a static key "br_switchdev_fwd_offload_used" to minimize the impact of the newly introduced feature on all the setups which don't have hardware that can make use of it - introduce a check for nbp->flags & BR_FWD_OFFLOAD to optimize cache line access - reorder nbp_switchdev_frame_mark_accel() and br_handle_vlan() in __br_forward() - do not strip VLAN on egress if forwarding offload on VLAN-aware bridge is being used - propagate errors from .ndo_dfwd_add_station() if not EOPNOTSUPP v2->v3: - replace the solution based on .ndo_dfwd_add_station with a solution based on switchdev_bridge_port_offload - rename BR_FWD_OFFLOAD to BR_TX_FWD_OFFLOAD v3->v4: rebase v4->v5: - make sure the static key is decremented on bridge port unoffload - more function and variable renaming and comments for them: br_switchdev_fwd_offload_used to br_switchdev_tx_fwd_offload br_switchdev_accels_skb to br_switchdev_frame_uses_tx_fwd_offload nbp_switchdev_frame_mark_tx_fwd to nbp_switchdev_frame_mark_tx_fwd_to_hwdom nbp_switchdev_frame_mark_accel to nbp_switchdev_frame_mark_tx_fwd_offload fwd_accel to tx_fwd_offload Signed-off-by: Tobias Waldekranz Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index bbf680093823..f0b4ffbd8582 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -57,6 +57,7 @@ struct br_ip_list { #define BR_MRP_AWARE BIT(17) #define BR_MRP_LOST_CONT BIT(18) #define BR_MRP_LOST_IN_CONT BIT(19) +#define BR_TX_FWD_OFFLOAD BIT(20) #define BR_DEFAULT_AGEING_TIME (300 * HZ) @@ -182,6 +183,7 @@ int switchdev_bridge_port_offload(struct net_device *brport_dev, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, + bool tx_fwd_offload, struct netlink_ext_ack *extack); void switchdev_bridge_port_unoffload(struct net_device *brport_dev, const void *ctx, @@ -195,6 +197,7 @@ switchdev_bridge_port_offload(struct net_device *brport_dev, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, + bool tx_fwd_offload, struct netlink_ext_ack *extack) { return -EINVAL; -- cgit v1.2.3 From 3cee6fb8e69ecd79be891c89a94974c48a25a437 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:19 -0700 Subject: bpf: tcp: Support bpf_(get|set)sockopt in bpf tcp iter This patch allows bpf tcp iter to call bpf_(get|set)sockopt. To allow a specific bpf iter (tcp here) to call a set of helpers, get_func_proto function pointer is added to bpf_iter_reg. The bpf iter is a tracing prog which currently requires CAP_PERFMON or CAP_SYS_ADMIN, so this patch does not impose other capability checks for bpf_(get|set)sockopt. 
Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200619.1036715-1-kafai@fb.com --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 978ebd16ae60..c8cc09013210 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1442,6 +1442,9 @@ typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, struct seq_file *seq); typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); +typedef const struct bpf_func_proto * +(*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id, + const struct bpf_prog *prog); enum bpf_iter_feature { BPF_ITER_RESCHED = BIT(0), @@ -1454,6 +1457,7 @@ struct bpf_iter_reg { bpf_iter_detach_target_t detach_target; bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; + bpf_iter_get_func_proto_t get_func_proto; u32 ctx_arg_info_size; u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; @@ -1476,6 +1480,8 @@ struct bpf_iter__bpf_map_elem { int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); +const struct bpf_func_proto * +bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); @@ -2050,6 +2056,8 @@ extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; +extern const struct bpf_func_proto bpf_sk_setsockopt_proto; +extern const struct bpf_func_proto bpf_sk_getsockopt_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From 616d5769345528b989294a242a5906b157a92837 Mon Sep 17 00:00:00 2001 From: Tal Gilboa Date: Sun, 18 Jul 2021 14:54:13 +0300 Subject: IB/mlx5: Rename is_apu_thread_cq function to is_apu_cq is_apu_thread_cq() used to detect CQs which are attached to APU threads. This was extended to support other elements as well, so the function was renamed to is_apu_cq(). c_eqn_or_apu_element was extended from 8 bits to 32 bits, which wasn't reflected when the APU support was first introduced. Acked-by: Michael S.
Tsirkin # vdpa Signed-off-by: Tal Gilboa Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3dd6641e942c..0b413f365699 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3919,7 +3919,7 @@ struct mlx5_ifc_cqc_bits { u8 status[0x4]; u8 reserved_at_4[0x2]; u8 dbr_umem_valid[0x1]; - u8 apu_thread_cq[0x1]; + u8 apu_cq[0x1]; u8 cqe_sz[0x3]; u8 cc[0x1]; u8 reserved_at_c[0x1]; @@ -3945,8 +3945,7 @@ struct mlx5_ifc_cqc_bits { u8 cq_period[0xc]; u8 cq_max_count[0x10]; - u8 reserved_at_a0[0x18]; - u8 c_eqn[0x8]; + u8 c_eqn_or_apu_element[0x20]; u8 reserved_at_c0[0x3]; u8 log_page_size[0x5]; -- cgit v1.2.3 From c757096ea1033c46ab768709847f7776b7e92a92 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 9 Oct 2019 06:41:08 +0200 Subject: can: rx-offload: add skb queue for use during ISR Adding an skb to the skb_queue in rx-offload requires taking a lock. This commit avoids that by adding an unlocked skb queue that is appended at the end of the ISR. Having one lock at the end of the ISR should be OK as the HW is empty, not about to overflow. Link: https://lore.kernel.org/r/20210724204745.736053-2-mkl@pengutronix.de Tested-by: Oleksij Rempel Co-developed-by: Kurt Van Dijck Signed-off-by: Kurt Van Dijck Signed-off-by: Marc Kleine-Budde --- include/linux/can/rx-offload.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index 40882df7105e..d71c938e17d0 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -20,6 +20,7 @@ struct can_rx_offload { bool drop); struct sk_buff_head skb_queue; + struct sk_buff_head skb_irq_queue; u32 skb_queue_len_max; unsigned int mb_first; @@ -48,6 +49,7 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, unsigned int *frame_len_ptr); int can_rx_offload_queue_tail(struct can_rx_offload *offload, struct sk_buff *skb); +void can_rx_offload_irq_finish(struct can_rx_offload *offload); void can_rx_offload_del(struct can_rx_offload *offload); void can_rx_offload_enable(struct can_rx_offload *offload); -- cgit v1.2.3 From 1e0d8e507ea42dd37f52636db300de7ea7118012 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 7 May 2021 17:58:30 +0200 Subject: can: rx-offload: can_rx_offload_irq_finish(): directly call napi_schedule() Instead of calling can_rx_offload_schedule(), call napi_schedule() directly. As this was the last use of can_rx_offload_schedule(), remove this helper function.
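Put together, the end-of-ISR flush could look roughly like this (a sketch assuming the standard skb_queue splice helpers, not the verbatim driver code):

#include <linux/can/rx-offload.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>

void can_rx_offload_irq_finish(struct can_rx_offload *offload)
{
	unsigned long flags;

	if (skb_queue_empty_lockless(&offload->skb_irq_queue))
		return;

	/* take the skb_queue lock only once, at the end of the ISR,
	 * and splice over everything gathered lock-free meanwhile */
	spin_lock_irqsave(&offload->skb_queue.lock, flags);
	skb_queue_splice_tail_init(&offload->skb_irq_queue,
				   &offload->skb_queue);
	spin_unlock_irqrestore(&offload->skb_queue.lock, flags);

	/* can_rx_offload_schedule() is gone: call napi_schedule() directly */
	napi_schedule(&offload->napi);
}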
Link: https://lore.kernel.org/r/20210724204745.736053-3-mkl@pengutronix.de Tested-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- include/linux/can/rx-offload.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index d71c938e17d0..516f64df0ebc 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -53,11 +53,6 @@ void can_rx_offload_irq_finish(struct can_rx_offload *offload); void can_rx_offload_del(struct can_rx_offload *offload); void can_rx_offload_enable(struct can_rx_offload *offload); -static inline void can_rx_offload_schedule(struct can_rx_offload *offload) -{ - napi_schedule(&offload->napi); -} - static inline void can_rx_offload_disable(struct can_rx_offload *offload) { napi_disable(&offload->napi); -- cgit v1.2.3 From 30bfec4fec5902731c8823f51c5332e6f2b2312a Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 10 May 2021 22:51:39 +0200 Subject: can: rx-offload: can_rx_offload_threaded_irq_finish(): add new function to be called from threaded interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After reading all CAN frames from the controller in the IRQ handler and storing them into an skb_queue, the driver calls napi_schedule(). In the NAPI poll function, the skbs from the skb_queue are then pushed into the networking stack. However, if napi_schedule() is called from a threaded IRQ handler, this triggers the following error: | NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #08!!! To avoid this, create a new rx-offload function (can_rx_offload_threaded_irq_finish()) with a call to local_bh_disable()/local_bh_enable() around the napi_schedule() call. Convert all drivers that call can_rx_offload_irq_finish() from threaded IRQ context to can_rx_offload_threaded_irq_finish(). Link: https://lore.kernel.org/r/20210724204745.736053-4-mkl@pengutronix.de Suggested-by: Daniel Glöckner Tested-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- include/linux/can/rx-offload.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index 516f64df0ebc..c11477620403 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -50,6 +50,7 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, int can_rx_offload_queue_tail(struct can_rx_offload *offload, struct sk_buff *skb); void can_rx_offload_irq_finish(struct can_rx_offload *offload); +void can_rx_offload_threaded_irq_finish(struct can_rx_offload *offload); void can_rx_offload_del(struct can_rx_offload *offload); void can_rx_offload_enable(struct can_rx_offload *offload); -- cgit v1.2.3 From 8345a330738149389dc8883573c9264965922e08 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 16 Jun 2021 11:55:26 +0200 Subject: can: bittiming: fix documentation for struct can_tdc This patch fixes a typo in the documentation for struct can_tdc::tdcv. The value for automatic mode is the number "0", not the letter "O". Further, two grammar errors in the documentation for struct can_tdc are fixed. First grammar error: add a missing third person 's'. Second grammar error: replace "such as" by "such that". The intent is to give a condition, not an example.
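A concrete example of the corrected condition (illustrative numbers, not taken from the patch): if the controller measures @tdcv = 14 time quanta and @tdco is configured as 8, the secondary sample point ends up at SSP = 14 + 8 = 22 time quanta after the start of bit reception on the RX pin.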
Fixes: 289ea9e4ae59 ("can: add new CAN FD bittiming parameters: Transmitter Delay Compensation (TDC)") Link: https://lore.kernel.org/r/20210616095922.2430415-1-mkl@pengutronix.de Link: https://lore.kernel.org/r/20210616124057.60723-1-mailhol.vincent@wanadoo.fr Co-developed-by: Vincent Mailhol Signed-off-by: Vincent Mailhol Acked-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index ae7a3411167c..9de6e9053e34 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -37,7 +37,7 @@ * quanta, from when the bit is sent on the TX pin to when it is * received on the RX pin of the transmitter. Possible options: * - * O: automatic mode. The controller dynamically measure @tdcv + * 0: automatic mode. The controller dynamically measures @tdcv * for each transmitted CAN FD frame. * * Other values: manual mode. Use the fixed provided value. @@ -45,7 +45,7 @@ * @tdco: Transmitter Delay Compensation Offset. Offset value, in time * quanta, defining the distance between the start of the bit * reception on the RX pin of the transceiver and the SSP - * position such as SSP = @tdcv + @tdco. + * position such that SSP = @tdcv + @tdco. * * If @tdco is zero, then TDC is disabled and both @tdcv and * @tdcf should be ignored. -- cgit v1.2.3 From 896e7f3e7424d6cc1436172740aa76ebb2c1b248 Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Fri, 2 Jul 2021 11:48:37 +0200 Subject: can: flexcan: add platform data header Add platform data header for flexcan. Link: https://lore.kernel.org/r/20210702094841.327679-1-angelo@kernel-space.org Signed-off-by: Angelo Dureghello Signed-off-by: Marc Kleine-Budde --- include/linux/can/platform/flexcan.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 include/linux/can/platform/flexcan.h (limited to 'include/linux') diff --git a/include/linux/can/platform/flexcan.h b/include/linux/can/platform/flexcan.h new file mode 100644 index 000000000000..1b536fb999de --- /dev/null +++ b/include/linux/can/platform/flexcan.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Angelo Dureghello + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _CAN_PLATFORM_FLEXCAN_H +#define _CAN_PLATFORM_FLEXCAN_H + +struct flexcan_platform_data { + u32 clock_frequency; + u8 clk_src; +}; + +#endif /* _CAN_PLATFORM_FLEXCAN_H */ -- cgit v1.2.3 From 26ab7b384525ccfa678c518577f7f0d841209c8b Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Fri, 23 Apr 2021 20:34:48 +0300 Subject: net/mlx5e: Block LRO if firmware asks for tunneled LRO This commit does a cleanup in LRO configuration. LRO is a parameter of an RQ, but its state is changed by modifying a TIR related to the RQ. The current status: LRO for tunneled packets is not supported in the driver, inner TIRs may enable LRO on creation, but LRO status of inner TIRs isn't changed in mlx5e_modify_tirs_lro(). 
This is inconsistent, but as long as the firmware doesn't declare support for tunneled LRO, it works, because the same RQs are shared between the inner and outer TIRs. This commit does two fixes: 1. If the firmware has the tunneled LRO capability, LRO is blocked altogether, because it's not possible to block it for inner TIRs only, when the same RQs are shared between inner and outer TIRs, and the driver won't be able to handle tunneled LRO traffic. 2. mlx5e_modify_tirs_lro() is patched to modify LRO state for all TIRs, including inner ones, because all TIRs related to an RQ should agree on their LRO state. Fixes: 7b3722fa9ef6 ("net/mlx5e: Support RSS for GRE tunneled packets") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index b0009aa3647f..6bbae0c3bc0b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -921,7 +921,8 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 scatter_fcs[0x1]; u8 enhanced_multi_pkt_send_wqe[0x1]; u8 tunnel_lso_const_out_ip_id[0x1]; - u8 reserved_at_1c[0x2]; + u8 tunnel_lro_gre[0x1]; + u8 tunnel_lro_vxlan[0x1]; u8 tunnel_stateless_gre[0x1]; u8 tunnel_stateless_vxlan[0x1]; -- cgit v1.2.3 From ee80dd2e89ecce9c5dd6f556b8f581c9e1cbb605 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 26 Jul 2021 19:55:29 +0300 Subject: net: bridge: add a helper for retrieving port VLANs from the data path Introduce a brother of br_vlan_get_info() which is protected by the RCU mechanism, as opposed to br_vlan_get_info() which relies on taking the write-side rtnl_mutex. This is needed for drivers which need to find out whether a bridge port has a VLAN configured or not. For example, certain DSA switches might not offer complete source port identification to the CPU on RX, just the VLAN in which the packet was received. Based on this VLAN, we cannot set an accurate skb->dev ingress port, but at least we can configure one that behaves the same as the correct one would (this is possible because DSA sets skb->offload_fwd_mark = 1). When we look at the bridge RX handler (br_handle_frame), we see that what matters regarding skb->dev is the VLAN ID and the port STP state. So we need to select an skb->dev that has the same bridge VLAN as the packet we're receiving, and is in the LEARNING or FORWARDING STP state. The latter is easy, but for the former, we should somehow keep a shadow list of the bridge VLANs on each port, and a lookup table between VLAN ID and the 'designated port for imprecise RX'. That is rather complicated to keep in sync properly (the designated port per VLAN needs to be updated on the addition and removal of a VLAN, as well as on the join/leave events of the bridge on that port). So, to avoid all that complexity, let's just iterate through our finite number of ports and ask the bridge, for each packet: "do you have this VLAN configured on this port?". Cc: Roopa Prabhu Cc: Nikolay Aleksandrov Cc: Ido Schimmel Cc: Jiri Pirko Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index f0b4ffbd8582..b73b4ff749e1 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -111,6 +111,8 @@ int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid); int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto); int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); +int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, + struct bridge_vlan_info *p_vinfo); #else static inline bool br_vlan_enabled(const struct net_device *dev) { @@ -137,6 +139,12 @@ static inline int br_vlan_get_info(const struct net_device *dev, u16 vid, { return -EINVAL; } + +static inline int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, + struct bridge_vlan_info *p_vinfo) +{ + return -EINVAL; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) -- cgit v1.2.3 From b6ad86e6ad6c46e52cac218e62613c6c47cf7fa0 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 26 Jul 2021 19:55:35 +0300 Subject: net: dsa: sja1105: add bridge TX data plane offload based on tag_8021q The main desire for having this feature in sja1105 is to support network stack termination for traffic coming from a VLAN-aware bridge. For sja1105, offloading the bridge data plane means sending packets as-is, with the proper VLAN tag, to the chip. The chip will look up its FDB and forward them to the correct destination port. But we support bridge data plane offload even for VLAN-unaware bridges, and the implementation there is different. In fact, VLAN-unaware bridging is governed by tag_8021q, so it makes sense to have the .bridge_fwd_offload_add() implementation fully within tag_8021q. The key difference is that we only support 1 VLAN-aware bridge, but we support multiple VLAN-unaware bridges. So we need to make sure that the forwarding domain is not crossed by packets injected from the stack. For this, we introduce the concept of a tag_8021q TX VLAN for bridge forwarding offload. As opposed to the regular TX VLANs which contain only 2 ports (the user port and the CPU port), a bridge data plane TX VLAN is "multicast" (or "imprecise"): it contains all the ports that are part of a certain bridge, and the hardware will select where the packet goes within this "imprecise" forwarding domain. Each VLAN-unaware bridge has its own "imprecise" TX VLAN, so we make use of the unique "bridge_num" provided by DSA for the data plane offload. We use the same 3 bits from the tag_8021q VLAN ID format to encode this bridge number. Note that these 3 bit positions have been used before for sub-VLANs in best-effort VLAN filtering mode. The difference is that for best-effort, the sub-VLANs were only valid on RX (and it was documented that the sub-VLAN field needed to be transmitted as zero). Whereas for the bridge data plane offload, these 3 bits are only valid on TX. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/dsa/8021q.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index ec5abfcdefd1..c7fa4a3498fe 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -35,6 +35,16 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id); +int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, + struct net_device *br, + int bridge_num); + +void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, + struct net_device *br, + int bridge_num); + +u16 dsa_8021q_bridge_tx_fwd_offload_vid(int bridge_num); + u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port); u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port); -- cgit v1.2.3 From b9067f5dc4a07c8e24e01a1b277c6722d91be39e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 27 Jul 2021 15:44:47 +0200 Subject: net: split out SIOCDEVPRIVATE handling from dev_ioctl SIOCDEVPRIVATE ioctl commands are mainly used in really old drivers, and they have a number of problems: - They hide behind the normal .ndo_do_ioctl function that is also used for other things in modern drivers, so it's hard to spot a driver that actually uses one of these - Since drivers use a number different calling conventions, it is impossible to support compat mode for them in a generic way. - With all drivers using the same 16 commands codes, there is no way to introspect the data being passed through things like strace. Add a new net_device_ops callback pointer, to address the first two of these. Separating them from .ndo_do_ioctl makes it easy to grep for drivers with a .ndo_siocdevprivate callback, and the unwieldy name hopefully makes it easier to spot in code review. By passing the ifreq structure and the ifr_data pointer separately, it is no longer necessary to overload these, and the driver can use either one for a given command. Cc: Cong Wang Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c871dc223dfa..670e1a8e5928 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1361,6 +1361,9 @@ struct net_device_ops { int (*ndo_validate_addr)(struct net_device *dev); int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); + int (*ndo_siocdevprivate)(struct net_device *dev, + struct ifreq *ifr, + void __user *data, int cmd); int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); int (*ndo_change_mtu)(struct net_device *dev, -- cgit v1.2.3 From 25ec92fbdd23a0a2bfd2bdf489e60ea4f0ae46d1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 27 Jul 2021 15:45:04 +0200 Subject: hamradio: use ndo_siocdevprivate hamradio uses a set of private ioctls that do seem to work correctly in compat mode, as they only rely on the ifr_data pointer. Move them over to the ndo_siocdevprivate callback as a cleanup. Cc: Thomas Sailer Cc: Joerg Reuter Cc: Jean-Paul Roubelat Cc: linux-hams@vger.kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- include/linux/hdlcdrv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hdlcdrv.h b/include/linux/hdlcdrv.h index d4d633a49d36..5d70c3f98f5b 100644 --- a/include/linux/hdlcdrv.h +++ b/include/linux/hdlcdrv.h @@ -79,7 +79,7 @@ struct hdlcdrv_ops { */ int (*open)(struct net_device *); int (*close)(struct net_device *); - int (*ioctl)(struct net_device *, struct ifreq *, + int (*ioctl)(struct net_device *, void __user *, struct hdlcdrv_ioctl *, int); }; -- cgit v1.2.3 From a554bf96b49db4c208e305ae92546422e9489380 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 27 Jul 2021 15:45:12 +0200 Subject: dev_ioctl: pass SIOCDEVPRIVATE data separately The compat handlers for SIOCDEVPRIVATE are incorrect for any driver that passes data as part of struct ifreq rather than as an ifr_data pointer, or that passes data back this way, since the compat_ifr_data_ioctl() helper overwrites the ifr_data pointer and does not copy anything back out. Since all drivers using devprivate commands are now converted to the new .ndo_siocdevprivate callback, fix this by adding the missing piece and passing the pointer separately the whole way. This further unifies the native and compat logic for socket ioctls, as the new code now passes the correct pointer as well as the correct data for both native and compat ioctls. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 670e1a8e5928..658d8cf57342 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4012,9 +4012,9 @@ bool dev_valid_name(const char *name); int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg); int put_user_ifreq(struct ifreq *ifr, void __user *arg); int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, - bool *need_copyout); + void __user *data, bool *need_copyout); int dev_ifconf(struct net *net, struct ifconf __user *ifc); -int dev_ethtool(struct net *net, struct ifreq *); +int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata); unsigned int dev_get_flags(const struct net_device *); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); -- cgit v1.2.3 From a76053707dbf0dc020a73b4d90cd952409ef3691 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 27 Jul 2021 15:45:13 +0200 Subject: dev_ioctl: split out ndo_eth_ioctl Most users of ndo_do_ioctl are ethernet drivers that implement the MII commands SIOCGMIIPHY/SIOCGMIIREG/SIOCSMIIREG, or hardware timestamping with SIOCSHWTSTAMP/SIOCGHWTSTAMP. Separate these from the few drivers that use ndo_do_ioctl to implement SIOCBOND, SIOCBR and SIOCWANDEV commands. This is a purely cosmetic change intended to help readers find their way through the implementation. Cc: Doug Ledford Cc: Jason Gunthorpe Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Cc: Andrew Lunn Cc: Vivien Didelot Cc: Florian Fainelli Cc: Vladimir Oltean Cc: Leon Romanovsky Cc: linux-rdma@vger.kernel.org Signed-off-by: Arnd Bergmann Acked-by: Jason Gunthorpe Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 658d8cf57342..b6e062a3b0d4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1090,6 +1090,10 @@ struct netdev_net_notifier { * the generic interface code. If not defined ioctls return * not supported error code. * + * * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); + * Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG, + * SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP. + * * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); * Used to set network devices bus interface parameters. This interface * is retained for legacy reasons; new devices should use the bus @@ -1361,6 +1365,8 @@ struct net_device_ops { int (*ndo_validate_addr)(struct net_device *dev); int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); + int (*ndo_eth_ioctl)(struct net_device *dev, + struct ifreq *ifr, int cmd); int (*ndo_siocdevprivate)(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); -- cgit v1.2.3 From ad7eab2ab014748b062507b7ac69f8e856057717 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 27 Jul 2021 15:45:14 +0200 Subject: net: split out ndo_siowandev ioctl In order to further reduce the scope of ndo_do_ioctl(), move out the SIOCWANDEV handling into a new network device operation function. Adjust the prototype to only pass the if_settings sub-structure in place of the ifreq, and remove the redundant 'cmd' argument in the process. Cc: Krzysztof Halasa Cc: "Jan \"Yenya\" Kasprzak" Cc: Kevin Curtis Cc: Zhao Qiang Cc: Martin Schiller Cc: Jiri Slaby Cc: linux-x25@vger.kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- include/linux/hdlc.h | 4 ++-- include/linux/netdevice.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h index cacc4dd27794..630a388035f1 100644 --- a/include/linux/hdlc.h +++ b/include/linux/hdlc.h @@ -22,7 +22,7 @@ struct hdlc_proto { void (*start)(struct net_device *dev); /* if open & DCD */ void (*stop)(struct net_device *dev); /* if open & !DCD */ void (*detach)(struct net_device *dev); - int (*ioctl)(struct net_device *dev, struct ifreq *ifr); + int (*ioctl)(struct net_device *dev, struct if_settings *ifs); __be16 (*type_trans)(struct sk_buff *skb, struct net_device *dev); int (*netif_rx)(struct sk_buff *skb); netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); @@ -54,7 +54,7 @@ typedef struct hdlc_device { /* Exported from hdlc module */ /* Called by hardware driver when a user requests HDLC service */ -int hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); +int hdlc_ioctl(struct net_device *dev, struct if_settings *ifs); /* Must be used by hardware driver on module startup/exit */ #define register_hdlc_device(dev) register_netdev(dev) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b6e062a3b0d4..cc11382f76a3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1367,6 +1367,8 @@ struct net_device_ops { struct ifreq *ifr, int cmd); int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); + int (*ndo_siocwandev)(struct net_device *dev, + struct if_settings *ifs); int (*ndo_siocdevprivate)(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); -- cgit v1.2.3 From ad2f99aedf8fa77f3ae647153284fa63c43d3055 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 27 Jul 2021 15:45:16 +0200 Subject: net: bridge: move bridge ioctls out of .ndo_do_ioctl Working towards obsoleting the .ndo_do_ioctl operation entirely, stop passing the SIOCBRADDIF/SIOCBRDELIF device ioctl commands into this callback. My first attempt was to add another ndo_siocbr() callback, but as there is only a single driver that takes these commands and there is already a hook mechanism to call directly into this driver, extend this hook instead, and use it for both the deviceless and the device specific ioctl commands. Cc: Roopa Prabhu Cc: Nikolay Aleksandrov Cc: bridge@lists.linux-foundation.org Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index b73b4ff749e1..21daed10322e 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -61,7 +61,12 @@ struct br_ip_list { #define BR_DEFAULT_AGEING_TIME (300 * HZ) -extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); +struct net_bridge; +void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, + unsigned int cmd, struct ifreq *ifr, + void __user *uarg)); +int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, + struct ifreq *ifr, void __user *uarg); #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) int br_multicast_list_adjacent(struct net_device *dev, -- cgit v1.2.3 From 3d9d00bd1885afa6b2c766cf9bab7b54b1a951ed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 27 Jul 2021 15:45:17 +0200 Subject: net: bonding: move ioctl handling to private ndo operation All other user triggered operations are gone from ndo_ioctl, so move the SIOCBOND family into a custom operation as well. The .ndo_ioctl() helper is no longer called by the dev_ioctl.c code now, but there are still a few definitions in obsolete wireless drivers as well as the appletalk and ieee802154 layers to call SIOCSIFADDR/SIOCGIFADDR helpers from inside the kernel. Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cc11382f76a3..226bbee06730 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1086,9 +1086,14 @@ struct netdev_net_notifier { * Test if Media Access Control address is valid for the device. * * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); - * Called when a user requests an ioctl which can't be handled by - * the generic interface code. If not defined ioctls return - * not supported error code. + * Old-style ioctl entry point. This is used internally by the + * appletalk and ieee802154 subsystems but is no longer called by + * the device ioctl handler. + * + * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); + * Used by the bonding driver for its device specific ioctls: + * SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE, + * SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY * * * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG, @@ -1367,6 +1372,8 @@ struct net_device_ops { struct ifreq *ifr, int cmd); int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); + int (*ndo_siocbond)(struct net_device *dev, + struct ifreq *ifr, int cmd); int (*ndo_siocwandev)(struct net_device *dev, struct if_settings *ifs); int (*ndo_siocdevprivate)(struct net_device *dev, -- cgit v1.2.3 From 5fc88f93edf2f797f1aa63334cc6c86f9c15d585 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 28 Jul 2021 18:23:59 +0200 Subject: sk_buff: introduce 'slow_gro' flags The new flag tracks if any state field is set, so that GRO requires 'unusual'/slow prepare steps. Set such flag when a ct entry is attached to the skb, and never clear it. 
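A hedged sketch of the consumer side this enables (the helper name is an assumption; only the skb->slow_gro test reflects this series):

/* GRO can now gate its expensive prepare step on a single bit instead
 * of testing every state field individually.
 */
static void gro_prepare_sketch(struct sk_buff *skb)
{
	if (!skb->slow_gro)
		return;		/* fast path: no ct/dst/extensions attached */

	/* slow path: reconcile _nfct, dst and extensions before merging */
}
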
The new bit uses an existing hole into the sk_buff struct RFC -> v1: - use a single state bit, never clear it - avoid moving the _nfct field Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f19190820e63..3ff18300d210 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -689,6 +689,7 @@ typedef unsigned char *sk_buff_data_t; * CHECKSUM_UNNECESSARY (max 3) * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB + * @slow_gro: state present at GRO time, slower prepare step required * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @secmark: security marking @@ -870,6 +871,7 @@ struct sk_buff { #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; #endif + __u8 slow_gro:1; #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -4216,6 +4218,7 @@ static inline unsigned long skb_get_nfct(const struct sk_buff *skb) static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) + skb->slow_gro |= !!nfct; skb->_nfct = nfct; #endif } @@ -4375,6 +4378,7 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(dst)); #endif + dst->slow_gro = src->slow_gro; __nf_copy(dst, src, true); } -- cgit v1.2.3 From 8a886b142bd03d36612747e9aefdf0282c8b02dd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 28 Jul 2021 18:24:00 +0200 Subject: sk_buff: track dst status in slow_gro Similar to the previous patch, but covering the dst field: the slow_gro flag is additionally set when a dst is attached to the skb RFC -> v1: - use the existing flag instead of adding a new one Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3ff18300d210..b1e5bbfcc926 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -992,6 +992,7 @@ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { + skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst; } @@ -1008,6 +1009,7 @@ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + skb->slow_gro = !!dst; skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } -- cgit v1.2.3 From bc49d8169aa72295104f1558830c568efb946315 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 29 Jul 2021 10:20:39 +0800 Subject: mctp: Add MCTP base Add basic Kconfig, an initial (empty) af_mctp source object, and {AF,PF}_MCTP definitions, and the required definitions for a new protocol type. Signed-off-by: Jeremy Kerr Signed-off-by: David S. 
Miller --- include/linux/socket.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 0d8e3dcb7f88..fd9ce51582d8 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -223,8 +223,11 @@ struct ucred { * reuses AF_INET address family */ #define AF_XDP 44 /* XDP sockets */ +#define AF_MCTP 45 /* Management component + * transport protocol + */ -#define AF_MAX 45 /* For now.. */ +#define AF_MAX 46 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC @@ -274,6 +277,7 @@ struct ucred { #define PF_QIPCRTR AF_QIPCRTR #define PF_SMC AF_SMC #define PF_XDP AF_XDP +#define PF_MCTP AF_MCTP #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. */ -- cgit v1.2.3 From 583be982d93479ea3d85091b0fd0b01201ede87d Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 29 Jul 2021 10:20:44 +0800 Subject: mctp: Add device handling and netlink interface This change adds the infrastructure for managing MCTP netdevices; we add a pointer to the AF_MCTP-specific data to struct netdevice, and hook up the rtnetlink operations for adding and removing addresses. Includes changes from Matt Johnston . Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 226bbee06730..d63a94ecbf3b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1823,6 +1823,7 @@ enum netdev_ml_priv_type { * @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network * device struct * @mpls_ptr: mpls_dev struct pointer + * @mctp_ptr: MCTP specific data * * @dev_addr: Hw address (before bcast, * because most packets are unicast) @@ -2110,6 +2111,9 @@ struct net_device { #if IS_ENABLED(CONFIG_MPLS_ROUTING) struct mpls_dev __rcu *mpls_ptr; #endif +#if IS_ENABLED(CONFIG_MCTP) + struct mctp_dev __rcu *mctp_ptr; +#endif /* * Cache lines mostly used on receive path (including eth_type_trans()) -- cgit v1.2.3 From bc830525615df6b6b1793ac23750f32695903fd0 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 29 Jul 2021 15:48:54 +0800 Subject: net: netlink: Remove unused function lockdep_genl_is_held() and its caller arm not used now, just remove them. Signed-off-by: Yajun Deng Link: https://lore.kernel.org/r/20210729074854.8968-1-yajun.deng@linux.dev Signed-off-by: Jakub Kicinski --- include/linux/genetlink.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h index bc738504ab4a..c285968e437a 100644 --- a/include/linux/genetlink.h +++ b/include/linux/genetlink.h @@ -8,34 +8,11 @@ /* All generic netlink requests are serialized by a global lock. */ extern void genl_lock(void); extern void genl_unlock(void); -#ifdef CONFIG_LOCKDEP -extern bool lockdep_genl_is_held(void); -#endif /* for synchronisation between af_netlink and genetlink */ extern atomic_t genl_sk_destructing_cnt; extern wait_queue_head_t genl_sk_destructing_waitq; -/** - * rcu_dereference_genl - rcu_dereference with debug checking - * @p: The pointer to read, prior to dereferencing - * - * Do an rcu_dereference(p), but check caller either holds rcu_read_lock() - * or genl mutex. 
Note : Please prefer genl_dereference() or rcu_dereference() */ -#define rcu_dereference_genl(p) \ - rcu_dereference_check(p, lockdep_genl_is_held()) - -/** - * genl_dereference - fetch RCU pointer when updates are prevented by genl mutex - * @p: The pointer to read, prior to dereferencing - * - * Return the value of the specified RCU-protected pointer, but omit - * the READ_ONCE(), because caller holds genl mutex. - */ -#define genl_dereference(p) \ - rcu_dereference_protected(p, lockdep_genl_is_held()) - #define MODULE_ALIAS_GENL_FAMILY(family)\ MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family) -- cgit v1.2.3 From a432934a30679c0e3c47b87f13e4901bc1a3fc03 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 30 Jul 2021 18:30:53 +0200 Subject: sk_buff: avoid potentially clearing 'slow_gro' field If skb_dst_set_noref() is invoked with a NULL dst, the 'slow_gro' field is cleared, too. That could lead to wrong behavior if the skb later enters the GRO stage. Fix the potential issue by preserving a non-zero value of the 'slow_gro' field. Additionally, fix a comment typo. Reported-by: Sabrina Dubroca Reported-by: Jakub Kicinski Fixes: 8a886b142bd0 ("sk_buff: track dst status in slow_gro") Signed-off-by: Paolo Abeni Link: https://lore.kernel.org/r/aa42529252dc8bb02bd42e8629427040d1058537.1627662501.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b1e5bbfcc926..2bcdc8cd38be 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1009,7 +1009,7 @@ static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); - skb->slow_gro = !!dst; + skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } -- cgit v1.2.3 From 87663c39f898b18905499126548da61450628682 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 23 Jul 2021 15:18:01 +0200 Subject: netfilter: ebtables: do not hook tables by default If any of the ebtables table modules is loaded, hooks get registered in all netns: Before: 'unshare -n nft list hooks' shows: family bridge hook prerouting { -2147483648 ebt_broute -0000000300 ebt_nat_hook } family bridge hook input { -0000000200 ebt_filter_hook } family bridge hook forward { -0000000200 ebt_filter_hook } family bridge hook output { +0000000100 ebt_nat_hook +0000000200 ebt_filter_hook } family bridge hook postrouting { +0000000300 ebt_nat_hook } This adds 'template tables' for ebtables. Each ebtable_foo registers the table as a template, with an init function that gets called once the first get/setsockopt call is made. ebtables core then searches the (per netns) list of tables. If no table is found, it searches the list of templates instead. If a template entry exists, the init function is called, which enables the table and registers the hooks (so packets are diverted to the table). If no entry is found in the template list, request_module is called. After this, hook registration is delayed until the 'ebtables' (set/getsockopt) request is made for a given table and will only happen in the specific namespace. 
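A hedged sketch of the resulting module pattern, modeled on ebtable_filter; the table name, the ops array, and the exact ebt_register_table() argument list are illustrative:

static int __net_init frame_filter_table_init(struct net *net)
{
	/* runs on the first ebtables get/setsockopt in this netns */
	return ebt_register_table(net, &frame_filter, ebt_ops_filter);
}

static int __init ebtable_filter_init(void)
{
	/* register the template only; no hooks in any netns yet */
	return ebt_register_template(&frame_filter, frame_filter_table_init);
}

static void __exit ebtable_filter_fini(void)
{
	ebt_unregister_template(&frame_filter);
}
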
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index a8178253ce53..10a01978bc0d 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -127,4 +127,6 @@ static inline bool ebt_invalid_target(int target) return (target < -NUM_STANDARD_TARGETS || target >= 0); } +int ebt_register_template(const struct ebt_table *t, int(*table_init)(struct net *net)); +void ebt_unregister_template(const struct ebt_table *t); #endif -- cgit v1.2.3 From 371cf74e78f3468016e8c7a159fc288a71d4dc86 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Fri, 2 Jul 2021 10:38:32 +0300 Subject: net/mlx5: Move TTC logic to fs_ttc Now that TTC logic is not dependent on mlx5e structs, move it to lib/fs_ttc.c so it could be used other part of the mlx5 driver. Signed-off-by: Maor Gottlieb Reviewed-by: Tariq Toukan Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 77746f7e35b8..0106c67e8ccb 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -38,6 +38,8 @@ #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 +#define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) + enum { MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO = 1 << 16, MLX5_FLOW_CONTEXT_ACTION_ENCRYPT = 1 << 17, -- cgit v1.2.3 From f1260ff15a71b8fc122b2c9abd8a7abffb6e0168 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Mon, 2 Aug 2021 11:52:15 +0300 Subject: skbuff: introduce skb_expand_head() Like skb_realloc_headroom(), new helper increases headroom of specified skb. Unlike skb_realloc_headroom(), it does not allocate a new skb if possible; copies skb->sk on new skb when as needed and frees original skb in case of failures. This helps to simplify ip[6]_finish_output2() and a few other similar cases. Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2bcdc8cd38be..783cc2368bb1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1183,6 +1183,7 @@ static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask); struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom); +struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, -- cgit v1.2.3 From 5ea2f5ffde39251115ef9a566262fb9e52b91cb7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 3 Aug 2021 13:40:46 +0200 Subject: move netdev_boot_setup into Space.c This is now only used by a handful of old ISA drivers, and can be moved into the file they already all depend on. Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d63a94ecbf3b..cd136499ec59 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -295,18 +295,6 @@ enum netdev_state_t { }; -/* - * This structure holds boot-time configured netdevice settings. They - * are then used in the device probing. - */ -struct netdev_boot_setup { - char name[IFNAMSIZ]; - struct ifmap map; -}; -#define NETDEV_BOOT_SETUP_MAX 8 - -int __init netdev_boot_setup(char *str); - struct gro_list { struct list_head list; int count; @@ -2939,7 +2927,6 @@ static inline struct net_device *first_net_device_rcu(struct net *net) } int netdev_boot_setup_check(struct net_device *dev); -unsigned long netdev_boot_base(const char *prefix, int unit); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); -- cgit v1.2.3 From 27cfdadd687deca58146b415f60b23d185cb3532 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Tue, 3 Aug 2021 19:57:42 +0300 Subject: bus: fsl-mc: extend fsl_mc_get_endpoint() to pass interface ID In the case of a DPAA2 switch object, the interface ID is also needed when querying for the object endpoint. Extend fsl_mc_get_endpoint() so that users can also pass the interface ID they are interested in. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- include/linux/fsl/mc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 63b56aba925a..30ece3ae6df7 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -423,7 +423,8 @@ int __must_check fsl_mc_allocate_irqs(struct fsl_mc_device *mc_dev); void fsl_mc_free_irqs(struct fsl_mc_device *mc_dev); -struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev); +struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev, + u16 if_id); extern struct bus_type fsl_mc_bus_type; -- cgit v1.2.3 From 271e5b7d00aeff7c61fb6c5415d14dbedb783b68 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Aug 2021 06:05:26 -0700 Subject: net: add netif_set_real_num_queues() for device reconfig netif_set_real_num_rx_queues() and netif_set_real_num_tx_queues() can fail, which breaks drivers trying to implement reconfiguration in a way that can't leave the device half-broken. In other words, those functions are incompatible with the prepare/commit approach. Luckily, setting the real number of queues can fail only if the number is increased, meaning that if we order operations correctly we can guarantee ending up with either the new config (success) or the old one (on error). Provide a helper implementing such logic so that drivers don't have to duplicate it. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cd136499ec59..1b4d4509d04b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3916,6 +3916,8 @@ static inline int netif_set_real_num_rx_queues(struct net_device *dev, return 0; } #endif +int netif_set_real_num_queues(struct net_device *dev, + unsigned int txq, unsigned int rxq); static inline struct netdev_rx_queue * __netif_get_rx_queue(struct net_device *dev, unsigned int rxq) -- cgit v1.2.3 From 957e2235e5264c97cd6be8e2e17f2e11b41f2239 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 3 Aug 2021 23:34:08 +0300 Subject: net: make switchdev_bridge_port_{,unoffload} loosely coupled with the bridge With the introduction of explicit offloading API in switchdev in commit 2f5dc00f7a3e ("net: bridge: switchdev: let drivers inform which bridge ports are offloaded"), we started having Ethernet switch drivers calling directly into a function exported by net/bridge/br_switchdev.c, which is a function exported by the bridge driver. This means that drivers that did not have an explicit dependency on the bridge before, like cpsw and am65-cpsw, now do - otherwise it is not possible to call a symbol exported by a driver that can be built as module unless you are a module too. There was an attempt to solve the dependency issue in the form of commit b0e81817629a ("net: build all switchdev drivers as modules when the bridge is a module"). Grygorii Strashko, however, says about it: | In my opinion, the problem is a bit bigger here than just fixing the | build :( | | In case, of ^cpsw the switchdev mode is kinda optional and in many | cases (especially for testing purposes, NFS) the multi-mac mode is | still preferable mode. | | There were no such tight dependency between switchdev drivers and | bridge core before and switchdev serviced as independent, notification | based layer between them, so ^cpsw still can be "Y" and bridge can be | "M". Now for mostly every kernel build configuration the CONFIG_BRIDGE | will need to be set as "Y", or we will have to update drivers to | support build with BRIDGE=n and maintain separate builds for | networking vs non-networking testing. But is this enough? Wouldn't | it cause 'chain reaction' required to add more and more "Y" options | (like CONFIG_VLAN_8021Q)? | | PS. Just to be sure we on the same page - ARM builds will be forced | (with this patch) to have CONFIG_TI_CPSW_SWITCHDEV=m and so all our | automation testing will just fail with omap2plus_defconfig. In the light of this, it would be desirable for some configurations to avoid dependencies between switchdev drivers and the bridge, and have the switchdev mode as completely optional within the driver. Arnd Bergmann also tried to write a patch which better expressed the build time dependency for Ethernet switch drivers where the switchdev support is optional, like cpsw/am65-cpsw, and this made the drivers follow the bridge (compile as module if the bridge is a module) only if the optional switchdev support in the driver was enabled in the first place: https://patchwork.kernel.org/project/netdevbpf/patch/20210802144813.1152762-1-arnd@kernel.org/ but this still did not solve the fact that cpsw and am65-cpsw now must be built as modules when the bridge is a module - it just expressed correctly that optional dependency. But the new behavior is an apparent regression from Grygorii's perspective. 
So to support the use case where the Ethernet driver is built-in, NET_SWITCHDEV (a bool option) is enabled, and the bridge is a module, we need a framework that can handle the possible absence of the bridge from the running system, i.e. runtime bloatware as opposed to build-time bloatware. Luckily we already have this framework, since switchdev has been using it extensively. Events from the bridge side are transmitted to the driver side using notifier chains - this was originally done so that unrelated drivers could snoop for events emitted by the bridge towards ports that are implemented by other drivers (think of a switch driver with LAG offload that listens for switchdev events on a bonding/team interface that it offloads). There are also events which are transmitted from the driver side to the bridge side, which again are modeled using notifiers. SWITCHDEV_FDB_ADD_TO_BRIDGE is an example of this, and deals with notifying the bridge that a MAC address has been dynamically learned. So there is a precedent we can use for modeling the new framework. The difference compared to SWITCHDEV_FDB_ADD_TO_BRIDGE is that the work that the bridge needs to do when a port becomes offloaded is blocking in its nature: replay VLANs, MDBs etc. The calling context is indeed blocking (we are under rtnl_mutex), but the existing switchdev notification chain that the bridge is subscribed to is only the atomic one. So we need to subscribe the bridge to the blocking switchdev notification chain too. This patch: - keeps the driver-side perception of the switchdev_bridge_port_{,un}offload unchanged - moves the implementation of switchdev_bridge_port_{,un}offload from the bridge module into the switchdev module. - makes everybody that is subscribed to the switchdev blocking notifier chain "hear" offload & unoffload events - makes the bridge driver subscribe and handle those events - moves the bridge driver's handling of those events into 2 new functions called br_switchdev_port_{,un}offload. These functions contain in fact the core of the logic that was previously in switchdev_bridge_port_{,un}offload, just that now we go through an extra indirection layer to reach them. Unlike all the other switchdev notification structures, the structure used to carry the bridge port information, struct switchdev_notifier_brport_info, does not contain a "bool handled". This is because in the current usage pattern, we always know that a switchdev bridge port offloading event will be handled by the bridge, because the switchdev_bridge_port_offload() call was initiated by a NETDEV_CHANGEUPPER event in the first place, where info->upper_dev is a bridge. So if the bridge wasn't loaded, then the CHANGEUPPER event couldn't have happened. Signed-off-by: Vladimir Oltean Tested-by: Grygorii Strashko Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 21daed10322e..509e18c7e740 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -190,39 +190,4 @@ static inline clock_t br_get_ageing_time(const struct net_device *br_dev) } #endif -#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_NET_SWITCHDEV) - -int switchdev_bridge_port_offload(struct net_device *brport_dev, - struct net_device *dev, const void *ctx, - struct notifier_block *atomic_nb, - struct notifier_block *blocking_nb, - bool tx_fwd_offload, - struct netlink_ext_ack *extack); -void switchdev_bridge_port_unoffload(struct net_device *brport_dev, - const void *ctx, - struct notifier_block *atomic_nb, - struct notifier_block *blocking_nb); - -#else - -static inline int -switchdev_bridge_port_offload(struct net_device *brport_dev, - struct net_device *dev, const void *ctx, - struct notifier_block *atomic_nb, - struct notifier_block *blocking_nb, - bool tx_fwd_offload, - struct netlink_ext_ack *extack) -{ - return -EINVAL; -} - -static inline void -switchdev_bridge_port_unoffload(struct net_device *brport_dev, - const void *ctx, - struct notifier_block *atomic_nb, - struct notifier_block *blocking_nb) -{ -} -#endif - #endif -- cgit v1.2.3 From e6a1f7e0b0fe5997b896b793c70d12fc5ed06cdd Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 4 Aug 2021 16:18:50 -0500 Subject: net/ipv4/igmp: Use struct_size() helper Replace IP_SFLSIZE() with struct_size() helper in order to avoid any potential type mistakes or integer overflows that, in the worst scenario, could lead to heap overflows. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- include/linux/igmp.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 64ce8cd1cfaf..93c262ecbdc9 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -41,9 +41,6 @@ struct ip_sf_socklist { __be32 sl_addr[]; }; -#define IP_SFLSIZE(count) (sizeof(struct ip_sf_socklist) + \ - (count) * sizeof(__be32)) - #define IP_SFBLOCK 10 /* allocate this many at once */ /* ip_mc_socklist is real list now. Speed is not argument; -- cgit v1.2.3 From b37a466837393af72fe8bcb8f1436410f3f173f3 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 5 Aug 2021 19:54:34 +0800 Subject: netdevice: add the case if dev is NULL Add the case if dev is NULL in dev_{put, hold}, so the caller doesn't need to care whether dev is NULL or not. Signed-off-by: Yajun Deng Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1b4d4509d04b..135c943699d0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4143,11 +4143,13 @@ void netdev_run_todo(void); */ static inline void dev_put(struct net_device *dev) { + if (dev) { #ifdef CONFIG_PCPU_DEV_REFCNT - this_cpu_dec(*dev->pcpu_refcnt); + this_cpu_dec(*dev->pcpu_refcnt); #else - refcount_dec(&dev->dev_refcnt); + refcount_dec(&dev->dev_refcnt); #endif + } } /** @@ -4158,11 +4160,13 @@ static inline void dev_put(struct net_device *dev) */ static inline void dev_hold(struct net_device *dev) { + if (dev) { #ifdef CONFIG_PCPU_DEV_REFCNT - this_cpu_inc(*dev->pcpu_refcnt); + this_cpu_inc(*dev->pcpu_refcnt); #else - refcount_inc(&dev->dev_refcnt); + refcount_inc(&dev->dev_refcnt); #endif + } } /* Carrier loss detection, dial on demand. The functions netif_carrier_on -- cgit v1.2.3 From 97a8a8c1f985baf13a3d0d252b787850330d2ea7 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 3 Aug 2021 16:19:46 -0700 Subject: net/mlx5: Return mdev from eswitch Export a function so users can retrieve the mellanox device that manages the eswitch from the eswitch device. Signed-off-by: Mark Bloch Reviewed-by: Mark Zhang Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eswitch.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index bc7db2e059eb..c2a34ff85188 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -128,6 +128,7 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); +struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw); #else /* CONFIG_MLX5_ESWITCH */ @@ -171,6 +172,11 @@ static inline u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) return 0; } +static inline struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw) +{ + return NULL; +} + #endif /* CONFIG_MLX5_ESWITCH */ static inline bool is_mdev_switchdev_mode(struct mlx5_core_dev *dev) -- cgit v1.2.3 From af8c0e25f249abf8829f0cfa074b08d7398e3e38 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 3 Aug 2021 16:19:47 -0700 Subject: net/mlx5: Lag, add initial logic for shared FDB As shared FDB requires changes in two subsystems first expose the needed core functions so the RDMA side can be changed. mlx5_lag_is_master(): return true if a given mlx5 device is the lag master. mlx5_lag_is_shared_fdb(): Returns true if the lag mode is shared FDB. mlx5_lag_get_peer_mdev(): Return the peer mdev in lag. The mentioned functions will be used by downstream patches in order to add support for shared FDB for the RDMA side. 
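A hedged sketch of the consumer-side pattern these helpers enable (the surrounding probe logic is illustrative, not part of this patch):

static void shared_fdb_probe_sketch(struct mlx5_core_dev *dev)
{
	struct mlx5_core_dev *peer;

	if (!mlx5_lag_is_shared_fdb(dev))
		return;			/* regular per-eswitch setup */

	if (mlx5_lag_is_master(dev)) {
		peer = mlx5_lag_get_peer_mdev(dev);
		/* set up RDMA resources covering the peer's vports too */
	}
}
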
Signed-off-by: Mark Bloch Reviewed-by: Mark Zhang Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1efe37466969..af4dd6e9f97f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1138,6 +1138,8 @@ bool mlx5_lag_is_roce(struct mlx5_core_dev *dev); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); +bool mlx5_lag_is_master(struct mlx5_core_dev *dev); +bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, struct net_device *slave); @@ -1145,6 +1147,7 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, u64 *values, int num_counters, size_t *offsets); +struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev); struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev); void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up); int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, -- cgit v1.2.3 From 979bf468fc543444eb750c8f8817407f509bd504 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 3 Aug 2021 16:19:49 -0700 Subject: {net, RDMA}/mlx5: Extend send to vport rules In shared FDB mode only one eswitch is active, and it receives traffic from all representors and all vports in the HCA. While the Ethernet representor will always reside on its native PF, the IB representor will not. Extend send-to-vport rule creation to support such flows: it must account for the source vport that sends the traffic (on which the representor resides) and for the target eswitch that the traffic reaches. Signed-off-by: Mark Bloch Reviewed-by: Mark Zhang Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eswitch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index c2a34ff85188..0bfcf7b8ecf9 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -63,6 +63,7 @@ struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw, void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type); struct mlx5_flow_handle * mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, + struct mlx5_eswitch *from_esw, struct mlx5_eswitch_rep *rep, u32 sqn); #ifdef CONFIG_MLX5_ESWITCH -- cgit v1.2.3 From c8e6a9e6d6bb29db08e0b69ae97f1e46ccc5691c Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 3 Aug 2021 16:19:54 -0700 Subject: net/mlx5: E-Switch, Add event callback for representors This callback allows notifying representors about relevant events when in OFFLOADS mode. In downstream patches, this will be used to notify about PAIR/UNPAIR devcom events. 
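An illustrative dispatch of the new callback on a devcom pair event; the lookup of the rep and its ops is assumed, not part of this patch:

static int esw_rep_notify_pair_sketch(struct mlx5_eswitch *esw,
				      struct mlx5_eswitch_rep *rep,
				      const struct mlx5_eswitch_rep_ops *ops,
				      void *peer_esw)
{
	if (!ops->event)
		return 0;	/* the callback is optional */

	return ops->event(esw, rep, MLX5_SWITCHDEV_EVENT_PAIR, peer_esw);
}
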
Signed-off-by: Mark Bloch Reviewed-by: Mark Zhang Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eswitch.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 0bfcf7b8ecf9..4ab5c1fc1270 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -29,11 +29,20 @@ enum { REP_LOADED, }; +enum mlx5_switchdev_event { + MLX5_SWITCHDEV_EVENT_PAIR, + MLX5_SWITCHDEV_EVENT_UNPAIR, +}; + struct mlx5_eswitch_rep; struct mlx5_eswitch_rep_ops { int (*load)(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep); void (*unload)(struct mlx5_eswitch_rep *rep); void *(*get_proto_dev)(struct mlx5_eswitch_rep *rep); + int (*event)(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep, + enum mlx5_switchdev_event event, + void *data); }; struct mlx5_eswitch_rep_data { -- cgit v1.2.3 From fdacd57c79b79a03c7ca88f706ad9fb7b46831c1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 3 Aug 2021 16:47:19 +0200 Subject: netfilter: x_tables: never register tables by default For historical reasons x_tables still register tables by default in the initial namespace. Only newly created net namespaces add the hook on demand. This means that the init_net always pays hook cost, even if no filtering rules are added (e.g. only used inside a single netns). Note that the hooks are added even when 'iptables -L' is called. This is because there is no way to tell 'iptables -A' and 'iptables -L' apart at kernel level. The only solution would be to register the table, but delay hook registration until the first rule gets added (or policy gets changed). That however means that counters are not hooked either, so 'iptables -L' would always show 0-counters even when traffic is flowing which might be unexpected. This keeps table and hook registration consistent with what is already done in non-init netns: first iptables(-save) invocation registers both table and hooks. This applies the same solution adopted for ebtables. All tables register a template that contains the l3 family, the name and a constructor function that is called when the initial table has to be added. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 28d7027cd460..5897f3dbaf7c 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -238,9 +238,6 @@ struct xt_table { u_int8_t af; /* address/protocol family */ int priority; /* hook order */ - /* called when table is needed in the given netns */ - int (*table_init)(struct net *net); - /* A unique name... 
*/ const char name[XT_TABLE_MAXNAMELEN]; }; @@ -452,6 +449,9 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); +int xt_register_template(const struct xt_table *t, int(*table_init)(struct net *net)); +void xt_unregister_template(const struct xt_table *t); + #ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include -- cgit v1.2.3 From 879af96ffd72706c6e3278ea6b45b0b0e37ec5d7 Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Sat, 31 Jul 2021 05:57:33 +0000 Subject: net, core: Add support for XDP redirection to slave device This adds the ndo_xdp_get_xmit_slave hook for transforming XDP_TX into XDP_REDIRECT after BPF program run when the ingress device is a bond slave. The dev_xdp_prog_count is exposed so that slave devices can be checked for loaded XDP programs in order to avoid the situation where both bond master and slave have programs loaded according to xdp_state. Signed-off-by: Jussi Maki Signed-off-by: Daniel Borkmann Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Link: https://lore.kernel.org/bpf/20210731055738.16820-3-joamaki@gmail.com --- include/linux/filter.h | 13 ++++++++++++- include/linux/netdevice.h | 6 ++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index ff698c9d1c94..1797e8506929 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -776,6 +776,10 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, DECLARE_BPF_DISPATCHER(xdp) +DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); + +u32 xdp_master_redirect(struct xdp_buff *xdp); + static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { @@ -783,7 +787,14 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ - return __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); + u32 act = __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); + + if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { + if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) + act = xdp_master_redirect(xdp); + } + + return act; } void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d63a94ecbf3b..02c6e8e10c86 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1330,6 +1330,9 @@ struct netdev_net_notifier { * that got dropped are freed/returned via xdp_return_frame(). * Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. + * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev, + * struct xdp_buff *xdp); + * Get the xmit slave of master device based on the xdp_buff. 
* int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); * This function is used to wake up the softirq, ksoftirqd or kthread * responsible for sending and/or receiving packets on a specific @@ -1557,6 +1560,8 @@ struct net_device_ops { int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, u32 flags); + struct net_device * (*ndo_xdp_get_xmit_slave)(struct net_device *dev, + struct xdp_buff *xdp); int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); @@ -4087,6 +4092,7 @@ typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +u8 dev_xdp_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); -- cgit v1.2.3 From 57f05bc2ab2443b89c2e2562c05053bcc7d30e8b Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 6 Aug 2021 10:46:19 +0800 Subject: page_pool: keep pp info as long as page pool owns the page Currently, page->pp is cleared and set every time the page is recycled, which is unnecessary. So only set page->pp when the page is added to the page pool, and only clear it when the page is released from the page pool. This is also a preparation for supporting frag page allocation in the page pool. Reviewed-by: Ilias Apalodimas Signed-off-by: Yunsheng Lin Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 783cc2368bb1..6bdb0db3e825 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4712,11 +4712,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb) } #ifdef CONFIG_PAGE_POOL -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page, - struct page_pool *pp) +static inline void skb_mark_for_recycle(struct sk_buff *skb) { skb->pp_recycle = 1; - page_pool_store_mem_info(page, pp); } #endif -- cgit v1.2.3 From 0e9d2a0a3a836c37528899010e73b5be8111753e Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 6 Aug 2021 10:46:20 +0800 Subject: page_pool: add interface to manipulate frag count in page pool For 32-bit systems with 64-bit DMA, dma_addr[1] is used to store the upper 32 bits of the DMA address; such systems should be rare these days. For normal systems, dma_addr[1] in 'struct page' is not used, so we can reuse it for storing the frag count, which means how many frags this page might be split into. In order to simplify the page frag support in the page pool, the PAGE_POOL_DMA_USE_PP_FRAG_COUNT macro is added to indicate 32-bit systems with 64-bit DMA, and page frag support in the page pool is disabled for such systems. The newly added page_pool_set_frag_count() is called to reserve the maximum frag count before any page frag is passed to the user. The page_pool_atomic_sub_frag_count_return() is called when the user is done with the page frag.
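To illustrate the intended lifecycle, a minimal sketch (not part of this patch; only the two helpers above come from this change, while the surrounding driver code and NR_FRAGS are hypothetical):

	/* Reserve the maximum frag count once per pool page, then drop
	 * one reference each time a frag is consumed; the last reference
	 * lets the page be returned to the pool.
	 */
	struct page *page = page_pool_dev_alloc_pages(pool);

	if (page) /* page will back NR_FRAGS receive buffers */
		page_pool_set_frag_count(page, NR_FRAGS);

	/* later, once per consumed frag, e.g. in the Rx completion path */
	if (page_pool_atomic_sub_frag_count_return(page, 1) == 0)
		page_pool_recycle_direct(pool, page); /* last user */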
Signed-off-by: Yunsheng Lin Signed-off-by: Jakub Kicinski --- include/linux/mm_types.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 52bbd2b7cb46..7f8ee09c711f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -103,11 +103,19 @@ struct page { unsigned long pp_magic; struct page_pool *pp; unsigned long _pp_mapping_pad; - /** - * @dma_addr: might require a 64-bit value on - * 32-bit architectures. - */ - unsigned long dma_addr[2]; + unsigned long dma_addr; + union { + /** + * dma_addr_upper: might require a 64-bit + * value on 32-bit architectures. + */ + unsigned long dma_addr_upper; + /** + * For frag page support, not supported in + * 32-bit architectures with 64-bit DMA. + */ + atomic_long_t pp_frag_count; + }; }; struct { /* slab, slob and slub */ union { -- cgit v1.2.3 From 39c538d64479c949aaeca4fe73d2226f715adfb7 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 30 Jul 2021 11:03:00 +0800 Subject: net/mlx5: Fix typo in comments Fix typo: *vectores ==> vectors *realeased ==> released *erros ==> errors *namepsace ==> namespace *trafic ==> traffic *proccessed ==> processed *retore ==> restore *Currenlty ==> Currently *crated ==> created *chane ==> change *cannnot ==> cannot *usuallly ==> usually *failes ==> fails *importent ==> important *reenabled ==> re-enabled *alocation ==> allocation *recived ==> received *tanslation ==> translation Signed-off-by: Cai Huoqing Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 2 +- include/linux/mlx5/driver.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0025913505ab..1e9d55dc1a9c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1038,7 +1038,7 @@ enum { struct mlx5_mkey_seg { /* This is a two bit field occupying bits 31-30. * bit 31 is always 0, - * bit 30 is zero for regular MRs and 1 (e.g free) for UMRs that do not have tanslation + * bit 30 is zero for regular MRs and 1 (e.g free) for UMRs that do not have translation */ u8 status; u8 pcie_control; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index af4dd6e9f97f..524051d1b2e3 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -581,7 +581,7 @@ struct mlx5_priv { /* end: qp staff */ /* start: alloc staff */ - /* protect buffer alocation according to numa node */ + /* protect buffer allocation according to numa node */ struct mutex alloc_mutex; int numa_node; @@ -1111,7 +1111,7 @@ static inline u8 mlx5_mkey_variant(u32 mkey) } /* Async-atomic event notifier used by mlx5 core to forward FW - * evetns recived from event queue to mlx5 consumers. + * evetns received from event queue to mlx5 consumers. * Optimise event queue dipatching. */ int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb); -- cgit v1.2.3 From 8e792700b994a4b79abe1303eb379bbd1f4212be Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 1 Aug 2021 11:37:57 +0300 Subject: net/mlx5: Delete impossible dev->state checks The new mlx5_core device structure is allocated through devlink_alloc with kzalloc, which ensures that all fields, including ->state, are equal to zero. That means that checks of that field in mlx5_init_one() are completely redundant, because that function is called only once at the beginning of the mlx5_core_dev lifetime.
PCI: .probe() -> probe_one() -> mlx5_init_one() The recovery flow can't run at that time or before it, because the relevant work is initialized later in mlx5_init_once(). This initialization flow ensures that dev->state can't be MLX5_DEVICE_STATE_UNINITIALIZED at all, so remove such impossible checks. Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 524051d1b2e3..2b5c5604b091 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -623,8 +623,7 @@ struct mlx5_priv { }; enum mlx5_device_state { - MLX5_DEVICE_STATE_UNINITIALIZED, - MLX5_DEVICE_STATE_UP, + MLX5_DEVICE_STATE_UP = 1, MLX5_DEVICE_STATE_INTERNAL_ERROR, }; -- cgit v1.2.3 From 5958a6fad623ad3b67a9e4d8dbd5f1874cc7039e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Jul 2021 12:36:05 +0300 Subject: net/mlx5: Reorganize current and maximal capabilities to be per-type In the current code, the current and maximal capabilities are maintained in separate arrays which are both per type. In order to allow the creation of such a basic structure as a dynamically allocated array, we move curr and max fields to a unified structure so that specific capabilities can be allocated as one unit. Signed-off-by: Parav Pandit Reviewed-by: Leon Romanovsky Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 66 ++++++++++++++++++++++----------------------- include/linux/mlx5/driver.h | 8 ++++-- 2 files changed, 39 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 1e9d55dc1a9c..2736f12bb57c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1213,55 +1213,55 @@ enum mlx5_qcam_feature_groups { /* GET Dev Caps macros */ #define MLX5_CAP_GEN(mdev, cap) \ - MLX5_GET(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap) + MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].cur, cap) #define MLX5_CAP_GEN_64(mdev, cap) \ - MLX5_GET64(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap) + MLX5_GET64(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].cur, cap) #define MLX5_CAP_GEN_MAX(mdev, cap) \ - MLX5_GET(cmd_hca_cap, mdev->caps.hca_max[MLX5_CAP_GENERAL], cap) + MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].max, cap) #define MLX5_CAP_GEN_2(mdev, cap) \ - MLX5_GET(cmd_hca_cap_2, mdev->caps.hca_cur[MLX5_CAP_GENERAL_2], cap) + MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].cur, cap) #define MLX5_CAP_GEN_2_64(mdev, cap) \ - MLX5_GET64(cmd_hca_cap_2, mdev->caps.hca_cur[MLX5_CAP_GENERAL_2], cap) + MLX5_GET64(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].cur, cap) #define MLX5_CAP_GEN_2_MAX(mdev, cap) \ - MLX5_GET(cmd_hca_cap_2, mdev->caps.hca_max[MLX5_CAP_GENERAL_2], cap) + MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].max, cap) #define MLX5_CAP_ETH(mdev, cap) \ MLX5_GET(per_protocol_networking_offload_caps,\ - mdev->caps.hca_cur[MLX5_CAP_ETHERNET_OFFLOADS], cap) + mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS].cur, cap) #define MLX5_CAP_ETH_MAX(mdev, cap) \ MLX5_GET(per_protocol_networking_offload_caps,\ - mdev->caps.hca_max[MLX5_CAP_ETHERNET_OFFLOADS], cap) + mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS].max, cap) #define MLX5_CAP_IPOIB_ENHANCED(mdev, cap) \ MLX5_GET(per_protocol_networking_offload_caps,\ - mdev->caps.hca_cur[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS],
cap) + mdev->caps.hca[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS].cur, cap) #define MLX5_CAP_ROCE(mdev, cap) \ - MLX5_GET(roce_cap, mdev->caps.hca_cur[MLX5_CAP_ROCE], cap) + MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE].cur, cap) #define MLX5_CAP_ROCE_MAX(mdev, cap) \ - MLX5_GET(roce_cap, mdev->caps.hca_max[MLX5_CAP_ROCE], cap) + MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE].max, cap) #define MLX5_CAP_ATOMIC(mdev, cap) \ - MLX5_GET(atomic_caps, mdev->caps.hca_cur[MLX5_CAP_ATOMIC], cap) + MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC].cur, cap) #define MLX5_CAP_ATOMIC_MAX(mdev, cap) \ - MLX5_GET(atomic_caps, mdev->caps.hca_max[MLX5_CAP_ATOMIC], cap) + MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC].max, cap) #define MLX5_CAP_FLOWTABLE(mdev, cap) \ - MLX5_GET(flow_table_nic_cap, mdev->caps.hca_cur[MLX5_CAP_FLOW_TABLE], cap) + MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE].cur, cap) #define MLX5_CAP64_FLOWTABLE(mdev, cap) \ - MLX5_GET64(flow_table_nic_cap, (mdev)->caps.hca_cur[MLX5_CAP_FLOW_TABLE], cap) + MLX5_GET64(flow_table_nic_cap, (mdev)->caps.hca[MLX5_CAP_FLOW_TABLE].cur, cap) #define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \ - MLX5_GET(flow_table_nic_cap, mdev->caps.hca_max[MLX5_CAP_FLOW_TABLE], cap) + MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE].max, cap) #define MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) \ MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.cap) @@ -1301,11 +1301,11 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ MLX5_GET(flow_table_eswitch_cap, \ - mdev->caps.hca_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].cur, cap) #define MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, cap) \ MLX5_GET(flow_table_eswitch_cap, \ - mdev->caps.hca_max[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].max, cap) #define MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) \ MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_nic_esw_fdb.cap) @@ -1327,31 +1327,31 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_ESW(mdev, cap) \ MLX5_GET(e_switch_cap, \ - mdev->caps.hca_cur[MLX5_CAP_ESWITCH], cap) + mdev->caps.hca[MLX5_CAP_ESWITCH].cur, cap) #define MLX5_CAP64_ESW_FLOWTABLE(mdev, cap) \ MLX5_GET64(flow_table_eswitch_cap, \ - (mdev)->caps.hca_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + (mdev)->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].cur, cap) #define MLX5_CAP_ESW_MAX(mdev, cap) \ MLX5_GET(e_switch_cap, \ - mdev->caps.hca_max[MLX5_CAP_ESWITCH], cap) + mdev->caps.hca[MLX5_CAP_ESWITCH].max, cap) #define MLX5_CAP_ODP(mdev, cap)\ - MLX5_GET(odp_cap, mdev->caps.hca_cur[MLX5_CAP_ODP], cap) + MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP].cur, cap) #define MLX5_CAP_ODP_MAX(mdev, cap)\ - MLX5_GET(odp_cap, mdev->caps.hca_max[MLX5_CAP_ODP], cap) + MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP].max, cap) #define MLX5_CAP_VECTOR_CALC(mdev, cap) \ MLX5_GET(vector_calc_cap, \ - mdev->caps.hca_cur[MLX5_CAP_VECTOR_CALC], cap) + mdev->caps.hca[MLX5_CAP_VECTOR_CALC].cur, cap) #define MLX5_CAP_QOS(mdev, cap)\ - MLX5_GET(qos_cap, mdev->caps.hca_cur[MLX5_CAP_QOS], cap) + MLX5_GET(qos_cap, mdev->caps.hca[MLX5_CAP_QOS].cur, cap) #define MLX5_CAP_DEBUG(mdev, cap)\ - MLX5_GET(debug_cap, mdev->caps.hca_cur[MLX5_CAP_DEBUG], cap) + MLX5_GET(debug_cap, mdev->caps.hca[MLX5_CAP_DEBUG].cur, cap) #define MLX5_CAP_PCAM_FEATURE(mdev, fld) \ MLX5_GET(pcam_reg, (mdev)->caps.pcam, feature_cap_mask.enhanced_features.fld) @@ -1387,27 +1387,27 @@ enum mlx5_qcam_feature_groups { MLX5_GET64(fpga_cap, (mdev)->caps.fpga, 
cap) #define MLX5_CAP_DEV_MEM(mdev, cap)\ - MLX5_GET(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap) + MLX5_GET(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM].cur, cap) #define MLX5_CAP64_DEV_MEM(mdev, cap)\ - MLX5_GET64(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap) + MLX5_GET64(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM].cur, cap) #define MLX5_CAP_TLS(mdev, cap) \ - MLX5_GET(tls_cap, (mdev)->caps.hca_cur[MLX5_CAP_TLS], cap) + MLX5_GET(tls_cap, (mdev)->caps.hca[MLX5_CAP_TLS].cur, cap) #define MLX5_CAP_DEV_EVENT(mdev, cap)\ - MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca_cur[MLX5_CAP_DEV_EVENT], cap) + MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca[MLX5_CAP_DEV_EVENT].cur, cap) #define MLX5_CAP_DEV_VDPA_EMULATION(mdev, cap)\ MLX5_GET(virtio_emulation_cap, \ - (mdev)->caps.hca_cur[MLX5_CAP_VDPA_EMULATION], cap) + (mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION].cur, cap) #define MLX5_CAP64_DEV_VDPA_EMULATION(mdev, cap)\ MLX5_GET64(virtio_emulation_cap, \ - (mdev)->caps.hca_cur[MLX5_CAP_VDPA_EMULATION], cap) + (mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION].cur, cap) #define MLX5_CAP_IPSEC(mdev, cap)\ - MLX5_GET(ipsec_cap, (mdev)->caps.hca_cur[MLX5_CAP_IPSEC], cap) + MLX5_GET(ipsec_cap, (mdev)->caps.hca[MLX5_CAP_IPSEC].cur, cap) enum { MLX5_CMD_STAT_OK = 0x0, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2b5c5604b091..854443ea812c 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -729,6 +729,11 @@ struct mlx5_profile { } mr_cache[MAX_MR_CACHE_ENTRIES]; }; +struct mlx5_hca_cap { + u32 cur[MLX5_UN_SZ_DW(hca_cap_union)]; + u32 max[MLX5_UN_SZ_DW(hca_cap_union)]; +}; + struct mlx5_core_dev { struct device *device; enum mlx5_coredev_type coredev_type; @@ -740,8 +745,7 @@ struct mlx5_core_dev { char board_id[MLX5_BOARD_ID_LEN]; struct mlx5_cmd cmd; struct { - u32 hca_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)]; - u32 hca_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)]; + struct mlx5_hca_cap hca[MLX5_CAP_NUM]; u32 pcam[MLX5_ST_SZ_DW(pcam_reg)]; u32 mcam[MLX5_MCAM_REGS_NUM][MLX5_ST_SZ_DW(mcam_reg)]; u32 fpga[MLX5_ST_SZ_DW(fpga_cap)]; -- cgit v1.2.3 From 48f02eef7f764f33e520ed8009d293396ca690cd Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Jul 2021 14:17:03 +0300 Subject: net/mlx5: Allocate individual capability Currently, mlx5_core_dev contains an array of capabilities. It contains 19 valid capabilities of the device, 2 reserved entries and 12 holes. Because of this, the 14 unused entries cost mlx5_core_dev 14 * 8K = 112K bytes of memory that is never used, and the mlx5_core_dev structure size is some 270 Kbytes. This allocation further aligns to the next power of 2, 512 Kbytes. By skipping non-existent entries, (a) 112 Kbytes are saved, (b) mlx5_core_dev reduces to 8KB with alignment, and (c) 350KB is saved in alignment. In the future, individual capability allocation can be used to skip allocating a capability that is disabled at the device level. This patch prepares mlx5_core_dev to hold capabilities through pointers instead of an inline array.
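As a sketch of where this leads, per-type allocation could look roughly like the following (illustrative only: mlx5_cap_type_exists() is a hypothetical predicate, and the helper name merely mirrors the mlx5_hca_caps_alloc() referenced by the comment added to device.h below):

	static int mlx5_hca_caps_alloc(struct mlx5_core_dev *dev)
	{
		int type;

		for (type = 0; type < MLX5_CAP_NUM; type++) {
			/* skip reserved entries, holes and disabled caps */
			if (!mlx5_cap_type_exists(dev, type)) /* hypothetical */
				continue;
			dev->caps.hca[type] = kzalloc(sizeof(struct mlx5_hca_cap),
						      GFP_KERNEL);
			if (!dev->caps.hca[type])
				return -ENOMEM; /* caller unwinds earlier entries */
		}
		return 0;
	}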
Signed-off-by: Parav Pandit Reviewed-by: Leon Romanovsky Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 69 +++++++++++++++++++++++---------------------- include/linux/mlx5/driver.h | 2 +- 2 files changed, 37 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2736f12bb57c..66eaf0aa7f69 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1157,6 +1157,9 @@ enum mlx5_cap_mode { HCA_CAP_OPMOD_GET_CUR = 1, }; +/* Any new cap addition must update mlx5_hca_caps_alloc() to allocate + * capability memory. + */ enum mlx5_cap_type { MLX5_CAP_GENERAL = 0, MLX5_CAP_ETHERNET_OFFLOADS, @@ -1213,55 +1216,55 @@ enum mlx5_qcam_feature_groups { /* GET Dev Caps macros */ #define MLX5_CAP_GEN(mdev, cap) \ - MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].cur, cap) + MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL]->cur, cap) #define MLX5_CAP_GEN_64(mdev, cap) \ - MLX5_GET64(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].cur, cap) + MLX5_GET64(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL]->cur, cap) #define MLX5_CAP_GEN_MAX(mdev, cap) \ - MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL].max, cap) + MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL]->max, cap) #define MLX5_CAP_GEN_2(mdev, cap) \ - MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].cur, cap) + MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2]->cur, cap) #define MLX5_CAP_GEN_2_64(mdev, cap) \ - MLX5_GET64(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].cur, cap) + MLX5_GET64(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2]->cur, cap) #define MLX5_CAP_GEN_2_MAX(mdev, cap) \ - MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2].max, cap) + MLX5_GET(cmd_hca_cap_2, mdev->caps.hca[MLX5_CAP_GENERAL_2]->max, cap) #define MLX5_CAP_ETH(mdev, cap) \ MLX5_GET(per_protocol_networking_offload_caps,\ - mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS].cur, cap) + mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS]->cur, cap) #define MLX5_CAP_ETH_MAX(mdev, cap) \ MLX5_GET(per_protocol_networking_offload_caps,\ - mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS].max, cap) + mdev->caps.hca[MLX5_CAP_ETHERNET_OFFLOADS]->max, cap) #define MLX5_CAP_IPOIB_ENHANCED(mdev, cap) \ MLX5_GET(per_protocol_networking_offload_caps,\ - mdev->caps.hca[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS].cur, cap) + mdev->caps.hca[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS]->cur, cap) #define MLX5_CAP_ROCE(mdev, cap) \ - MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE].cur, cap) + MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE]->cur, cap) #define MLX5_CAP_ROCE_MAX(mdev, cap) \ - MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE].max, cap) + MLX5_GET(roce_cap, mdev->caps.hca[MLX5_CAP_ROCE]->max, cap) #define MLX5_CAP_ATOMIC(mdev, cap) \ - MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC].cur, cap) + MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC]->cur, cap) #define MLX5_CAP_ATOMIC_MAX(mdev, cap) \ - MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC].max, cap) + MLX5_GET(atomic_caps, mdev->caps.hca[MLX5_CAP_ATOMIC]->max, cap) #define MLX5_CAP_FLOWTABLE(mdev, cap) \ - MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE].cur, cap) + MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE]->cur, cap) #define MLX5_CAP64_FLOWTABLE(mdev, cap) \ - MLX5_GET64(flow_table_nic_cap, (mdev)->caps.hca[MLX5_CAP_FLOW_TABLE].cur, cap) + MLX5_GET64(flow_table_nic_cap, (mdev)->caps.hca[MLX5_CAP_FLOW_TABLE]->cur, cap) 
#define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \ - MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE].max, cap) + MLX5_GET(flow_table_nic_cap, mdev->caps.hca[MLX5_CAP_FLOW_TABLE]->max, cap) #define MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) \ MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.cap) @@ -1301,11 +1304,11 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ MLX5_GET(flow_table_eswitch_cap, \ - mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].cur, cap) + mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE]->cur, cap) #define MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, cap) \ MLX5_GET(flow_table_eswitch_cap, \ - mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].max, cap) + mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE]->max, cap) #define MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) \ MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_nic_esw_fdb.cap) @@ -1327,31 +1330,31 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_ESW(mdev, cap) \ MLX5_GET(e_switch_cap, \ - mdev->caps.hca[MLX5_CAP_ESWITCH].cur, cap) + mdev->caps.hca[MLX5_CAP_ESWITCH]->cur, cap) #define MLX5_CAP64_ESW_FLOWTABLE(mdev, cap) \ MLX5_GET64(flow_table_eswitch_cap, \ - (mdev)->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE].cur, cap) + (mdev)->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE]->cur, cap) #define MLX5_CAP_ESW_MAX(mdev, cap) \ MLX5_GET(e_switch_cap, \ - mdev->caps.hca[MLX5_CAP_ESWITCH].max, cap) + mdev->caps.hca[MLX5_CAP_ESWITCH]->max, cap) #define MLX5_CAP_ODP(mdev, cap)\ - MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP].cur, cap) + MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, cap) #define MLX5_CAP_ODP_MAX(mdev, cap)\ - MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP].max, cap) + MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->max, cap) #define MLX5_CAP_VECTOR_CALC(mdev, cap) \ MLX5_GET(vector_calc_cap, \ - mdev->caps.hca[MLX5_CAP_VECTOR_CALC].cur, cap) + mdev->caps.hca[MLX5_CAP_VECTOR_CALC]->cur, cap) #define MLX5_CAP_QOS(mdev, cap)\ - MLX5_GET(qos_cap, mdev->caps.hca[MLX5_CAP_QOS].cur, cap) + MLX5_GET(qos_cap, mdev->caps.hca[MLX5_CAP_QOS]->cur, cap) #define MLX5_CAP_DEBUG(mdev, cap)\ - MLX5_GET(debug_cap, mdev->caps.hca[MLX5_CAP_DEBUG].cur, cap) + MLX5_GET(debug_cap, mdev->caps.hca[MLX5_CAP_DEBUG]->cur, cap) #define MLX5_CAP_PCAM_FEATURE(mdev, fld) \ MLX5_GET(pcam_reg, (mdev)->caps.pcam, feature_cap_mask.enhanced_features.fld) @@ -1387,27 +1390,27 @@ enum mlx5_qcam_feature_groups { MLX5_GET64(fpga_cap, (mdev)->caps.fpga, cap) #define MLX5_CAP_DEV_MEM(mdev, cap)\ - MLX5_GET(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM].cur, cap) + MLX5_GET(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM]->cur, cap) #define MLX5_CAP64_DEV_MEM(mdev, cap)\ - MLX5_GET64(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM].cur, cap) + MLX5_GET64(device_mem_cap, mdev->caps.hca[MLX5_CAP_DEV_MEM]->cur, cap) #define MLX5_CAP_TLS(mdev, cap) \ - MLX5_GET(tls_cap, (mdev)->caps.hca[MLX5_CAP_TLS].cur, cap) + MLX5_GET(tls_cap, (mdev)->caps.hca[MLX5_CAP_TLS]->cur, cap) #define MLX5_CAP_DEV_EVENT(mdev, cap)\ - MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca[MLX5_CAP_DEV_EVENT].cur, cap) + MLX5_ADDR_OF(device_event_cap, (mdev)->caps.hca[MLX5_CAP_DEV_EVENT]->cur, cap) #define MLX5_CAP_DEV_VDPA_EMULATION(mdev, cap)\ MLX5_GET(virtio_emulation_cap, \ - (mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION].cur, cap) + (mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION]->cur, cap) #define MLX5_CAP64_DEV_VDPA_EMULATION(mdev, cap)\ MLX5_GET64(virtio_emulation_cap, \ - (mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION].cur, cap) + (mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION]->cur, cap) #define 
MLX5_CAP_IPSEC(mdev, cap)\ - MLX5_GET(ipsec_cap, (mdev)->caps.hca[MLX5_CAP_IPSEC].cur, cap) + MLX5_GET(ipsec_cap, (mdev)->caps.hca[MLX5_CAP_IPSEC]->cur, cap) enum { MLX5_CMD_STAT_OK = 0x0, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 854443ea812c..90e5f42baa50 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -745,7 +745,7 @@ struct mlx5_core_dev { char board_id[MLX5_BOARD_ID_LEN]; struct mlx5_cmd cmd; struct { - struct mlx5_hca_cap hca[MLX5_CAP_NUM]; + struct mlx5_hca_cap *hca[MLX5_CAP_NUM]; u32 pcam[MLX5_ST_SZ_DW(pcam_reg)]; u32 mcam[MLX5_MCAM_REGS_NUM][MLX5_ST_SZ_DW(mcam_reg)]; u32 fpga[MLX5_ST_SZ_DW(fpga_cap)]; -- cgit v1.2.3 From afa79d08c6c8e1901cb1547591e3ccd3ec6965d9 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 13 Aug 2021 22:57:49 +0800 Subject: net: in_irq() cleanup Replace the obsolete and ambiguous macro in_irq() with the new macro in_hardirq(). Signed-off-by: Changbin Du Link: https://lore.kernel.org/r/20210813145749.86512-1-changbin.du@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bd8d5b8e2de3..2f03cd9e371a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3956,7 +3956,7 @@ void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason); /* * It is not allowed to call kfree_skb() or consume_skb() from hardware * interrupt context or with hardware interrupts being disabled. - * (in_irq() || irqs_disabled()) + * (in_hardirq() || irqs_disabled()) * * We provide four helpers that can be used in following contexts : * -- cgit v1.2.3 From e5f31552674e88bff3a4e3ca3e5357668b5f2973 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 12 Aug 2021 20:33:58 +0200 Subject: ethernet: fix PTP_1588_CLOCK dependencies The 'imply' keyword does not do what most people think it does; it only politely asks Kconfig to turn on another symbol, but does not prevent it from being disabled manually or built as a loadable module when the user is built-in. In the ICE driver, the latter now causes a link failure: aarch64-linux-ld: drivers/net/ethernet/intel/ice/ice_main.o: in function `ice_eth_ioctl': ice_main.c:(.text+0x13b0): undefined reference to `ice_ptp_get_ts_config' ice_main.c:(.text+0x13b0): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `ice_ptp_get_ts_config' aarch64-linux-ld: ice_main.c:(.text+0x13bc): undefined reference to `ice_ptp_set_ts_config' ice_main.c:(.text+0x13bc): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `ice_ptp_set_ts_config' aarch64-linux-ld: drivers/net/ethernet/intel/ice/ice_main.o: in function `ice_prepare_for_reset': ice_main.c:(.text+0x31fc): undefined reference to `ice_ptp_release' ice_main.c:(.text+0x31fc): relocation truncated to fit: R_AARCH64_CALL26 against undefined symbol `ice_ptp_release' aarch64-linux-ld: drivers/net/ethernet/intel/ice/ice_main.o: in function `ice_rebuild': This is a recurring problem in many drivers, and we have discussed it several times before, without reaching a consensus. I'm providing a link to the previous email thread for reference, which discusses some related problems. To solve the dependency issue better than the 'imply' keyword, introduce a separate Kconfig symbol "CONFIG_PTP_1588_CLOCK_OPTIONAL" that any driver can depend on if it is able to use PTP support when available, but works fine without it. 
Whenever CONFIG_PTP_1588_CLOCK=m, those drivers are then prevented from being built-in, the same way as with a 'depends on PTP_1588_CLOCK || !PTP_1588_CLOCK' dependency that does the same trick, but that can be rather confusing when you first see it. Since this should cover the dependencies correctly, the IS_REACHABLE() hack in the header is no longer needed now, and can be turned back into a normal IS_ENABLED() check. Any driver that gets the dependency wrong will now cause a link time failure rather than being unable to use PTP support when that is in a loadable module. However, the two recently added ptp_get_vclocks_index() and ptp_convert_timestamp() interfaces are only called from builtin code with ethtool and socket timestamps, so keep the current behavior by stubbing those out completely when PTP is in a loadable module. This should be addressed properly in a follow-up. As Richard suggested, we may want to actually turn PTP support into a 'bool' option later on, preventing it from being a loadable module altogether, which would be one way to solve the problem with the ethtool interface. Fixes: 06c16d89d2cb ("ice: register 1588 PTP clock device object for E810 devices") Link: https://lore.kernel.org/netdev/20210804121318.337276-1-arnd@kernel.org/ Link: https://lore.kernel.org/netdev/CAK8P3a06enZOf=XyZ+zcAwBczv41UuCTz+=0FMf2gBz1_cOnZQ@mail.gmail.com/ Link: https://lore.kernel.org/netdev/CAK8P3a3=eOxE-K25754+fB_-i_0BZzf9a9RfPTX3ppSwu9WZXw@mail.gmail.com/ Link: https://lore.kernel.org/netdev/20210726084540.3282344-1-arnd@kernel.org/ Acked-by: Shannon Nelson Acked-by: Jacob Keller Acked-by: Richard Cochran Reviewed-by: Vladimir Oltean Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210812183509.1362782-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ptp_clock_kernel.h | 48 +++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 71fac9237725..2e5565067355 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -215,7 +215,7 @@ static inline long scaled_ppm_to_ppb(long ppm) return (long)ppb; } -#if IS_REACHABLE(CONFIG_PTP_1588_CLOCK) +#if IS_ENABLED(CONFIG_PTP_1588_CLOCK) /** * ptp_clock_register() - register a PTP hardware clock driver @@ -307,6 +307,33 @@ int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay); */ void ptp_cancel_worker_sync(struct ptp_clock *ptp); +#else +static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, + struct device *parent) +{ return NULL; } +static inline int ptp_clock_unregister(struct ptp_clock *ptp) +{ return 0; } +static inline void ptp_clock_event(struct ptp_clock *ptp, + struct ptp_clock_event *event) +{ } +static inline int ptp_clock_index(struct ptp_clock *ptp) +{ return -1; } +static inline int ptp_find_pin(struct ptp_clock *ptp, + enum ptp_pin_function func, unsigned int chan) +{ return -1; } +static inline int ptp_schedule_worker(struct ptp_clock *ptp, + unsigned long delay) +{ return -EOPNOTSUPP; } +static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp) +{ } +#endif + +#if IS_BUILTIN(CONFIG_PTP_1588_CLOCK) +/* + * These are called by the network core, and don't work if PTP is in + * a loadable module. 
+ */ + /** * ptp_get_vclocks_index() - get all vclocks index on pclock, and * caller is responsible to free memory @@ -327,26 +354,7 @@ int ptp_get_vclocks_index(int pclock_index, int **vclock_index); */ void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps, int vclock_index); - #else -static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, - struct device *parent) -{ return NULL; } -static inline int ptp_clock_unregister(struct ptp_clock *ptp) -{ return 0; } -static inline void ptp_clock_event(struct ptp_clock *ptp, - struct ptp_clock_event *event) -{ } -static inline int ptp_clock_index(struct ptp_clock *ptp) -{ return -1; } -static inline int ptp_find_pin(struct ptp_clock *ptp, - enum ptp_pin_function func, unsigned int chan) -{ return -1; } -static inline int ptp_schedule_worker(struct ptp_clock *ptp, - unsigned long delay) -{ return -EOPNOTSUPP; } -static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp) -{ } static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { return 0; } static inline void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps, -- cgit v1.2.3 From 2c860a43dd77f969bb959336a2f743d7103a8f63 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 14 Aug 2021 10:57:15 +0900 Subject: bpf: af_unix: Implement BPF iterator for UNIX domain socket. This patch implements the BPF iterator for the UNIX domain socket. Currently, the batch optimisation introduced for the TCP iterator in the commit 04c7820b776f ("bpf: tcp: Bpf iter batching and lock_sock") is not used for the UNIX domain socket. It will require replacing the big lock for the hash table with small locks for each hash list so as not to block other processes. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210814015718.42704-2-kuniyu@amazon.co.jp --- include/linux/btf_ids.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 57890b357f85..bed4b9964581 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -172,7 +172,8 @@ extern struct btf_id_set name; BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) enum { #define BTF_SOCK_TYPE(name, str) name, -- cgit v1.2.3 From fb7dd8bca0139fd73d3f4a6cd257b11731317ded Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:54 -0700 Subject: bpf: Refactor BPF_PROG_RUN into a function Turn BPF_PROG_RUN into a proper always inlined function. No functional or performance changes are intended, but it makes it much easier to understand what's going on with how BPF programs actually get executed. It's more obvious what types and callbacks are expected. Also, the extra () around input parameters can be dropped, as well as the `__` variable prefixes intended to avoid naming collisions, which makes the code simpler to read and write. This refactoring also highlighted one extra issue. BPF_PROG_RUN is both a macro and an enum value (BPF_PROG_RUN == BPF_PROG_TEST_RUN). Turning BPF_PROG_RUN into a function causes a naming-conflict compilation error. So rename BPF_PROG_RUN to the lower-case bpf_prog_run(), similar to bpf_prog_run_xdp(), bpf_prog_run_pin_on_cpu(), etc. 
All existing callers of BPF_PROG_RUN, the macro, are switched to bpf_prog_run() explicitly. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-2-andrii@kernel.org --- include/linux/bpf.h | 2 +- include/linux/filter.h | 61 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 38 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c8cc09013210..968fea98087a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1103,7 +1103,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, /* an array of programs to be executed under rcu_lock. * * Typical usage: - * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN); + * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, bpf_prog_run); * * the structure returned by bpf_prog_array_alloc() should be populated * with program pointers and the last pointer must be NULL. diff --git a/include/linux/filter.h b/include/linux/filter.h index 1797e8506929..954373db20e7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -600,25 +600,38 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); -#define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ - u32 __ret; \ - cant_migrate(); \ - if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ - struct bpf_prog_stats *__stats; \ - u64 __start = sched_clock(); \ - __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - __stats = this_cpu_ptr(prog->stats); \ - u64_stats_update_begin(&__stats->syncp); \ - __stats->cnt++; \ - __stats->nsecs += sched_clock() - __start; \ - u64_stats_update_end(&__stats->syncp); \ - } else { \ - __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - } \ - __ret; }) - -#define BPF_PROG_RUN(prog, ctx) \ - __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func) +typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, + const struct bpf_insn *insnsi, + unsigned int (*bpf_func)(const void *, + const struct bpf_insn *)); + +static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, + const void *ctx, + bpf_dispatcher_fn dfunc) +{ + u32 ret; + + cant_migrate(); + if (static_branch_unlikely(&bpf_stats_enabled_key)) { + struct bpf_prog_stats *stats; + u64 start = sched_clock(); + + ret = dfunc(ctx, prog->insnsi, prog->bpf_func); + stats = this_cpu_ptr(prog->stats); + u64_stats_update_begin(&stats->syncp); + stats->cnt++; + stats->nsecs += sched_clock() - start; + u64_stats_update_end(&stats->syncp); + } else { + ret = dfunc(ctx, prog->insnsi, prog->bpf_func); + } + return ret; +} + +static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx) +{ + return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func); +} /* * Use in preemptible and therefore migratable context to make sure that @@ -637,7 +650,7 @@ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, u32 ret; migrate_disable(); - ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func); + ret = bpf_prog_run(prog, ctx); migrate_enable(); return ret; } @@ -742,7 +755,7 @@ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, memset(cb_data, 0, sizeof(cb_saved)); } - res = BPF_PROG_RUN(prog, skb); + res = bpf_prog_run(prog, skb); if (unlikely(prog->cb_access)) memcpy(cb_data, cb_saved, sizeof(cb_saved)); @@ -787,7 +800,7 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, * under local_bh_disable(), which provides the 
needed RCU protection * for accessing map entries. */ - u32 act = __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); + u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) @@ -1440,7 +1453,7 @@ static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, }; u32 act; - act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN); + act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; @@ -1478,7 +1491,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, }; u32 act; - act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN); + act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; -- cgit v1.2.3 From 7d08c2c9117113fee118487425ed55efa50cbfa9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:55 -0700 Subject: bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions Similar to BPF_PROG_RUN, turn BPF_PROG_RUN_ARRAY macros into proper functions with all the same readability and maintainability benefits. Making them into functions required shuffling around bpf_set_run_ctx/bpf_reset_run_ctx functions. Also, explicitly specifying the type of the BPF prog run callback required adjusting __bpf_prog_run_save_cb() to accept const void *, casted internally to const struct sk_buff. Further, split out a cgroup-specific BPF_PROG_RUN_ARRAY_CG and BPF_PROG_RUN_ARRAY_CG_FLAGS from the more generic BPF_PROG_RUN_ARRAY due to the differences in bpf_run_ctx used for those two different use cases. I think BPF_PROG_RUN_ARRAY_CG would benefit from further refactoring to accept struct cgroup and enum bpf_attach_type instead of bpf_prog_array, fetching cgrp->bpf.effective[type] and RCU-dereferencing it internally. But that required including include/linux/cgroup-defs.h, which I wasn't sure is ok with everyone. The remaining generic BPF_PROG_RUN_ARRAY function will be extended to pass-through user-provided context value in the next patch. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-3-andrii@kernel.org --- include/linux/bpf.h | 179 ++++++++++++++++++++++++++++--------------------- include/linux/filter.h | 5 +- 2 files changed, 107 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 968fea98087a..344e0d4d8ef6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1146,67 +1146,116 @@ struct bpf_run_ctx {}; struct bpf_cg_run_ctx { struct bpf_run_ctx run_ctx; - struct bpf_prog_array_item *prog_item; + const struct bpf_prog_array_item *prog_item; }; +static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) +{ + struct bpf_run_ctx *old_ctx = NULL; + +#ifdef CONFIG_BPF_SYSCALL + old_ctx = current->bpf_ctx; + current->bpf_ctx = new_ctx; +#endif + return old_ctx; +} + +static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) +{ +#ifdef CONFIG_BPF_SYSCALL + current->bpf_ctx = old_ctx; +#endif +} + /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) /* BPF program asks to set CN on the packet. 
*/ #define BPF_RET_SET_CN (1 << 0) -#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ - ({ \ - struct bpf_prog_array_item *_item; \ - struct bpf_prog *_prog; \ - struct bpf_prog_array *_array; \ - struct bpf_run_ctx *old_run_ctx; \ - struct bpf_cg_run_ctx run_ctx; \ - u32 _ret = 1; \ - u32 func_ret; \ - migrate_disable(); \ - rcu_read_lock(); \ - _array = rcu_dereference(array); \ - _item = &_array->items[0]; \ - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); \ - while ((_prog = READ_ONCE(_item->prog))) { \ - run_ctx.prog_item = _item; \ - func_ret = func(_prog, ctx); \ - _ret &= (func_ret & 1); \ - *(ret_flags) |= (func_ret >> 1); \ - _item++; \ - } \ - bpf_reset_run_ctx(old_run_ctx); \ - rcu_read_unlock(); \ - migrate_enable(); \ - _ret; \ - }) - -#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \ - ({ \ - struct bpf_prog_array_item *_item; \ - struct bpf_prog *_prog; \ - struct bpf_prog_array *_array; \ - struct bpf_run_ctx *old_run_ctx; \ - struct bpf_cg_run_ctx run_ctx; \ - u32 _ret = 1; \ - migrate_disable(); \ - rcu_read_lock(); \ - _array = rcu_dereference(array); \ - if (unlikely(check_non_null && !_array))\ - goto _out; \ - _item = &_array->items[0]; \ - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\ - while ((_prog = READ_ONCE(_item->prog))) { \ - run_ctx.prog_item = _item; \ - _ret &= func(_prog, ctx); \ - _item++; \ - } \ - bpf_reset_run_ctx(old_run_ctx); \ -_out: \ - rcu_read_unlock(); \ - migrate_enable(); \ - _ret; \ - }) +typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); + +static __always_inline u32 +BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog, + u32 *ret_flags) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + u32 ret = 1; + u32 func_ret; + + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(array_rcu); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + func_ret = run_prog(prog, ctx); + ret &= (func_ret & 1); + *(ret_flags) |= (func_ret >> 1); + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return ret; +} + +static __always_inline u32 +BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + u32 ret = 1; + + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(array_rcu); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + ret &= run_prog(prog, ctx); + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return ret; +} + +static __always_inline u32 +BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + u32 ret = 1; + + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(array_rcu); + if (unlikely(!array)) + goto out; + item = &array->items[0]; + while ((prog = 
READ_ONCE(item->prog))) { + ret &= run_prog(prog, ctx); + item++; + } +out: + rcu_read_unlock(); + migrate_enable(); + return ret; +} /* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs * so BPF programs can request cwr for TCP packets. @@ -1235,7 +1284,7 @@ _out: \ u32 _flags = 0; \ bool _cn; \ u32 _ret; \ - _ret = BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, &_flags); \ + _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ if (_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ @@ -1244,12 +1293,6 @@ _out: \ _ret; \ }) -#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true) - -#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false) - #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; @@ -1284,20 +1327,6 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } -static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) -{ - struct bpf_run_ctx *old_ctx; - - old_ctx = current->bpf_ctx; - current->bpf_ctx = new_ctx; - return old_ctx; -} - -static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) -{ - current->bpf_ctx = old_ctx; -} - extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; diff --git a/include/linux/filter.h b/include/linux/filter.h index 954373db20e7..7d248941ecea 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -723,7 +723,7 @@ static inline void bpf_restore_data_end( cb->data_end = saved_data_end; } -static inline u8 *bpf_skb_cb(struct sk_buff *skb) +static inline u8 *bpf_skb_cb(const struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta * data between tail calls. Since this also needs to work with @@ -744,8 +744,9 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb) /* Must be invoked with migration disabled */ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, - struct sk_buff *skb) + const void *ctx) { + const struct sk_buff *skb = ctx; u8 *cb_data = bpf_skb_cb(skb); u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; -- cgit v1.2.3 From b89fbfbb854c9afc3047e8273cc3a694650b802e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:57 -0700 Subject: bpf: Implement minimal BPF perf link Introduce a new type of BPF link - BPF perf link. This brings perf_event-based BPF program attachments (perf_event, tracepoints, kprobes, and uprobes) into the common BPF link infrastructure, allowing to list all active perf_event based attachments, auto-detaching BPF program from perf_event when link's FD is closed, get generic BPF link fdinfo/get_info functionality. BPF_LINK_CREATE command expects perf_event's FD as target_fd. No extra flags are currently supported. Force-detaching and atomic BPF program updates are not yet implemented, but with perf_event-based BPF links we now have common framework for this without the need to extend ioctl()-based perf_event interface. One interesting consideration is a new value for bpf_attach_type, which BPF_LINK_CREATE command expects. Generally, it's either 1-to-1 mapping from bpf_attach_type to bpf_prog_type, or many-to-1 mapping from a subset of bpf_attach_types to one bpf_prog_type (e.g., see BPF_PROG_TYPE_SK_SKB or BPF_PROG_TYPE_CGROUP_SOCK). 
In this case, though, we have three different program types (KPROBE, TRACEPOINT, PERF_EVENT) using the same perf_event-based mechanism, so it's many bpf_prog_types to one bpf_attach_type. I chose to define a single BPF_PERF_EVENT attach type for all of them and adjust link_create()'s logic for checking correspondence between attach type and program type. The alternative would be to define three new attach types (e.g., BPF_KPROBE, BPF_TRACEPOINT, and BPF_PERF_EVENT), but that seemed like unnecessary overkill and BPF_KPROBE will cause naming conflicts with BPF_KPROBE() macro, defined by libbpf. I chose to not do this to avoid unnecessary proliferation of bpf_attach_type enum values and not have to deal with naming conflicts. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-5-andrii@kernel.org --- include/linux/bpf_types.h | 3 +++ include/linux/trace_events.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index ae3ac3a2018c..9c81724e4b98 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -136,3 +136,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) #endif +#ifdef CONFIG_PERF_EVENTS +BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) +#endif diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index ad413b382a3c..8ac92560d3a3 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -803,6 +803,9 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +void perf_event_free_bpf_prog(struct perf_event *event); + void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, -- cgit v1.2.3 From 82e6b1eee6a8875ef4eacfd60711cce6965c6b04 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:58 -0700 Subject: bpf: Allow to specify user-provided bpf_cookie for BPF perf links Add ability for users to specify custom u64 value (bpf_cookie) when creating BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event, tracepoints). This is useful for cases when the same BPF program is used for attaching and processing invocation of different tracepoints/kprobes/uprobes in a generic fashion, but such that each invocation is distinguished from each other (e.g., BPF program can look up additional information associated with a specific kernel function without having to rely on function IP lookups). This enables new use cases to be implemented simply and efficiently that previously were possible only through code generation (and thus multiple instances of almost identical BPF program) or compilation at runtime (BCC-style) on target hosts (even more expensive resource-wise). For uprobes it is not even possible in some cases to know function IP before hand (e.g., when attaching to shared library without PID filtering, in which case base load address is not known for a library). This is done by storing u64 bpf_cookie in struct bpf_prog_array_item, corresponding to each attached and run BPF program. 
Given cgroup BPF programs already use two 8-byte pointers for their needs and cgroup BPF programs don't have (yet?) support for bpf_cookie, reuse that space through union of cgroup_storage and new bpf_cookie field. Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx. This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF program execution code, which luckily is now also split from BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper giving access to this user-provided cookie value from inside a BPF program. Generic perf_event BPF programs will access this value from perf_event itself through passed in BPF program context. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org --- include/linux/bpf.h | 16 +++++++++++++++- include/linux/perf_event.h | 1 + include/linux/trace_events.h | 6 +++--- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 344e0d4d8ef6..83c3cc5e90df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1114,7 +1114,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, */ struct bpf_prog_array_item { struct bpf_prog *prog; - struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + union { + struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + u64 bpf_cookie; + }; }; struct bpf_prog_array { @@ -1140,6 +1143,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, + u64 bpf_cookie, struct bpf_prog_array **new_array); struct bpf_run_ctx {}; @@ -1149,6 +1153,11 @@ struct bpf_cg_run_ctx { const struct bpf_prog_array_item *prog_item; }; +struct bpf_trace_run_ctx { + struct bpf_run_ctx run_ctx; + u64 bpf_cookie; +}; + static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; @@ -1239,6 +1248,8 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; u32 ret = 1; migrate_disable(); @@ -1246,11 +1257,14 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, array = rcu_dereference(array_rcu); if (unlikely(!array)) goto out; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { + run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; } + bpf_reset_run_ctx(old_run_ctx); out: rcu_read_unlock(); migrate_enable(); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2d510ad750ed..fe156a8170aa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -762,6 +762,7 @@ struct perf_event { #ifdef CONFIG_BPF_SYSCALL perf_overflow_handler_t orig_overflow_handler; struct bpf_prog *prog; + u64 bpf_cookie; #endif #ifdef CONFIG_EVENT_TRACING diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8ac92560d3a3..8e0631a4b046 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -675,7 +675,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) #ifdef CONFIG_BPF_EVENTS 
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); -int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); @@ -692,7 +692,7 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c } static inline int -perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } @@ -803,7 +803,7 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_free_bpf_prog(struct perf_event *event); void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); -- cgit v1.2.3 From 7adfc6c9b315e174cf8743b21b7b691c8766791b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:59 -0700 Subject: bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value Add a new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs to get access to a user-provided bpf_cookie value, specified during BPF program attachment (BPF link creation) time. Naming is hard, though. With the concept being named "BPF cookie", I've considered calling the helper: - bpf_get_cookie() -- seems too unspecific and easily mistaken with socket cookie; - bpf_get_bpf_cookie() -- too much tautology; - bpf_get_link_cookie() -- would be ok, but while we create a BPF link to attach BPF program to BPF hook, it's still an "attachment" and the bpf_cookie is associated with BPF program attachment to a hook, not a BPF link itself. Technically, we could support bpf_cookie with old-style cgroup programs. So I ultimately rejected it in favor of bpf_get_attach_cookie(). Currently all perf_event-backed BPF program types support the bpf_get_attach_cookie() helper. Follow-up patches will add support for fentry/fexit programs as well. While at it, mark bpf_tracing_func_proto() as static to make it obvious that it's only used from within kernel/trace/bpf_trace.c. 
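For context, a minimal sketch of the helper from the BPF program side (the attach point and output are just examples; standard libbpf headers are assumed):

	SEC("kprobe/do_sys_openat2") /* example attach point */
	int trace_fn(struct pt_regs *ctx)
	{
		/* the cookie tells apart which of possibly many
		 * attachments of this same program has fired
		 */
		__u64 cookie = bpf_get_attach_cookie(ctx);

		bpf_printk("attachment cookie: %llu", cookie);
		return 0;
	}

On the user-space side, the cookie is supplied at link creation time, e.g. through the bpf_cookie field this series adds to libbpf's attach options.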
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org --- include/linux/bpf.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83c3cc5e90df..f4c16f19f83e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2102,9 +2102,6 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; -const struct bpf_func_proto *bpf_tracing_func_proto( - enum bpf_func_id func_id, const struct bpf_prog *prog); - const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From 994d2cbb08ca05e3c1af954ec63a3ae32a862ac5 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 17 Aug 2021 17:58:47 +0300 Subject: net: dsa: tag_sja1105: be dsa_loop-safe Add support for tag_sja1105 running on non-sja1105 DSA ports, by making sure that every time we dereference dp->priv, we check the switch's dsa_switch_ops (otherwise we access a struct sja1105_port structure that is in fact something else). This adds an unconditional build-time dependency between sja1105 being built as module => tag_sja1105 must also be built as module. This was there only for PTP before. Some sane defaults must also take place when not running on sja1105 hardware. These are: - sja1105_xmit_tpid: the sja1105 driver uses different VLAN protocols depending on VLAN awareness and switch revision (when an encapsulated VLAN must be sent). Default to 0x8100. - sja1105_rcv_meta_state_machine: this aggregates PTP frames with their metadata timestamp frames. When running on non-sja1105 hardware, don't do that and accept all frames unmodified. - sja1105_defer_xmit: calls sja1105_port_deferred_xmit in sja1105_main.c which writes a management route over SPI. When not running on sja1105 hardware, bypass the SPI write and send the frame as-is. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 0eadc7ac44ec..6b0dc9ff92d1 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -88,4 +88,22 @@ static inline void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, #endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */ +#if IS_ENABLED(CONFIG_NET_DSA_SJA1105) + +extern const struct dsa_switch_ops sja1105_switch_ops; + +static inline bool dsa_port_is_sja1105(struct dsa_port *dp) +{ + return dp->ds->ops == &sja1105_switch_ops; +} + +#else + +static inline bool dsa_port_is_sja1105(struct dsa_port *dp) +{ + return false; +} + +#endif + #endif /* _NET_DSA_SJA1105_H */ -- cgit v1.2.3 From 4b1327be9fe57443295ae86fe0fcf24a18469e9f Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 17 Aug 2021 12:40:03 -0700 Subject: net-memcg: pass in gfp_t mask to mem_cgroup_charge_skmem() Add gfp_t mask as an input parameter to mem_cgroup_charge_skmem(), to give more control to the networking stack and enable it to change memcg charging behavior. In the future, the networking stack may decide to avoid oom-kills when fallbacks are more appropriate. 
One behavior change in mem_cgroup_charge_skmem() by this patch is to avoid force charging by default and let the caller decide when and if force charging is needed through the presence or absence of __GFP_NOFAIL. Signed-off-by: Wei Wang Reviewed-by: Shakeel Butt Signed-off-by: David S. Miller --- include/linux/memcontrol.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bfe5c486f4ad..f0ee30881ca9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1581,7 +1581,8 @@ static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; -bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); +bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, + gfp_t gfp_mask); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; -- cgit v1.2.3 From 2274af1d60fee3fe35f341fc5d4dbf99ab78fb2f Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Wed, 18 Aug 2021 18:07:09 +0300 Subject: net: mii: make mii_ethtool_gset() return void mii_ethtool_gset() does not return any errors. Since there are no users of this function that rely on its return value, it can be made void. Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- include/linux/mii.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index 219b93cad1dd..12ea29e04293 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -32,7 +32,7 @@ struct mii_if_info { extern int mii_link_ok (struct mii_if_info *mii); extern int mii_nway_restart (struct mii_if_info *mii); -extern int mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd); +extern void mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd); extern void mii_ethtool_get_link_ksettings( struct mii_if_info *mii, struct ethtool_link_ksettings *cmd); extern int mii_ethtool_sset(struct mii_if_info *mii, struct ethtool_cmd *ecmd); -- cgit v1.2.3 From 6e86a1543c378f2e8837ad88f361b7bf606c80f7 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 18 Aug 2021 09:12:32 +0200 Subject: can: dev: provide optional GPIO based termination support For CAN buses to work, a termination resistor has to be present at both ends of the bus. This resistor is usually 120 Ohms; other values may be required for special bus topologies. This patch adds support for a generic GPIO based CAN termination. The resistor value has to be specified via the device tree, and the termination can only be attached to or detached from the bus as a whole. By default the termination is not active.
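For illustration, a sketch of what the generic setter can look like, using the struct can_priv fields added below; the function body is illustrative rather than guaranteed to match the driver code exactly:

static int can_set_termination(struct net_device *ndev, u16 term)
{
	struct can_priv *priv = netdev_priv(ndev);
	int set;

	/* drive the GPIO high only for the resistance value that the
	 * device tree declared as "termination active"
	 */
	if (term == priv->termination_gpio_ohms[CAN_TERMINATION_GPIO_ENABLED])
		set = 1;
	else
		set = 0;

	gpiod_set_value(priv->termination_gpio, set);

	return 0;
}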
Link: https://lore.kernel.org/r/20210818071232.20585-4-o.rempel@pengutronix.de Signed-off-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 27b275e463da..2413253e54c7 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -32,6 +32,12 @@ enum can_mode { CAN_MODE_SLEEP }; +enum can_termination_gpio { + CAN_TERMINATION_GPIO_DISABLED = 0, + CAN_TERMINATION_GPIO_ENABLED, + CAN_TERMINATION_GPIO_MAX, +}; + /* * CAN common private data */ @@ -55,6 +61,8 @@ struct can_priv { unsigned int termination_const_cnt; const u16 *termination_const; u16 termination; + struct gpio_desc *termination_gpio; + u16 termination_gpio_ohms[CAN_TERMINATION_GPIO_MAX]; enum can_state state; -- cgit v1.2.3 From 44779a4b85abd1d1dab9e5b90bd5e6adcfc8143a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 18 Aug 2021 16:52:16 -0700 Subject: bpf: Use kvmalloc for map keys in syscalls Same as previous patch but for the keys. memdup_bpfptr is renamed to kvmemdup_bpfptr (and converted to kvmalloc). Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210818235216.1159202-2-sdf@google.com --- include/linux/bpfptr.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h index 5cdeab497cb3..546e27fc6d46 100644 --- a/include/linux/bpfptr.h +++ b/include/linux/bpfptr.h @@ -62,9 +62,17 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset, return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size); } -static inline void *memdup_bpfptr(bpfptr_t src, size_t len) +static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len) { - return memdup_sockptr((sockptr_t) src, len); + void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN); + + if (!p) + return ERR_PTR(-ENOMEM); + if (copy_from_bpfptr(p, src, len)) { + kvfree(p); + return ERR_PTR(-EFAULT); + } + return p; } static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count) -- cgit v1.2.3 From 1ae258f8b343a0c4316c5545bfaf21010e4f0c73 Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Mon, 31 May 2021 17:08:14 +0300 Subject: net/mlx5: E-switch, Introduce rate limiting groups API Extend eswitch API with rate limiting groups: - Define new struct mlx5_esw_rate_group that is used to hold all internal group data. - Implement functions that allow creation, destruction and cleanup of groups. - Assign all vports to an internal unlimited zero group by default. This commit lays the groundwork for group rate limiting by implementing devlink_ops->rate_node_{new|del}() callbacks to support creating and deleting groups through devlink rate node objects. APIs that allow setting rates and adding/removing members are implemented in the following patches.
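For orientation, the devlink callbacks this groundwork implements have roughly the following shape. This is a sketch: the signatures follow the devlink rate API, but esw_create_rate_group() and esw_destroy_rate_group() are hypothetical stand-ins for the group helpers the commit introduces, and error handling is trimmed:

static int esw_devlink_rate_node_new(struct devlink_rate *rate_node,
				     void **priv,
				     struct netlink_ext_ack *extack)
{
	struct mlx5_esw_rate_group *group;

	group = esw_create_rate_group();	/* hypothetical helper */
	if (IS_ERR(group)) {
		NL_SET_ERR_MSG_MOD(extack, "Failed to create rate group");
		return PTR_ERR(group);
	}

	*priv = group;	/* devlink hands this back to the del/set callbacks */
	return 0;
}

static int esw_devlink_rate_node_del(struct devlink_rate *rate_node,
				     void *priv,
				     struct netlink_ext_ack *extack)
{
	esw_destroy_rate_group(priv);		/* hypothetical helper */
	return 0;
}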
Co-developed-by: Vlad Buslov Signed-off-by: Vlad Buslov Signed-off-by: Dmytro Linkin Reviewed-by: Huy Nguyen Reviewed-by: Mark Bloch Reviewed-by: Parav Pandit Reviewed-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fce3cbae0b99..f3638d09ba77 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -865,7 +865,8 @@ struct mlx5_ifc_qos_cap_bits { u8 nic_bw_share[0x1]; u8 nic_rate_limit[0x1]; u8 packet_pacing_uid[0x1]; - u8 reserved_at_c[0x14]; + u8 log_esw_max_sched_depth[0x4]; + u8 reserved_at_10[0x10]; u8 reserved_at_20[0xb]; u8 log_max_qos_nic_queue_group[0x5]; -- cgit v1.2.3 From d2587c57ffd8dcad04171dfd203dcc4ff98e4782 Mon Sep 17 00:00:00 2001 From: Angus Ainslie Date: Thu, 12 Aug 2021 09:52:17 -0700 Subject: brcmfmac: add 43752 SDIO ids and initialization Add HW and SDIO ids for use with the SparkLan AP6275S. Add the firmware mapping structures for the BRCM43752 chipset. The 43752 needs some things set up similarly to the 43012 chipset. The WATERMARK shows better performance when initialized to the 4373 value. Signed-off-by: Angus Ainslie Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210812165218.2508258-2-angus@akkea.ca --- include/linux/mmc/sdio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 12036619346c..a85c9f0bd470 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -75,6 +75,7 @@ #define SDIO_DEVICE_ID_BROADCOM_43364 0xa9a4 #define SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 #define SDIO_DEVICE_ID_BROADCOM_43455 0xa9bf +#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752 0xaae8 #define SDIO_VENDOR_ID_MARVELL 0x02df #define SDIO_DEVICE_ID_MARVELL_LIBERTAS 0x9103 -- cgit v1.2.3 From 6fc88c354f3af83ffa2c285b86e76c759755693f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 19 Aug 2021 02:24:20 -0700 Subject: bpf: Migrate cgroup_bpf to internal cgroup_bpf_attach_type enum Add an enum (cgroup_bpf_attach_type) containing only valid cgroup_bpf attach types and a function to map bpf_attach_type values to the new enum. Inspired by netns_bpf_attach_type. Then, migrate cgroup_bpf to use cgroup_bpf_attach_type wherever possible. Functionality is unchanged as attach_type_to_prog_type switches in bpf/syscall.c were preventing non-cgroup programs from making use of the invalid cgroup_bpf array slots. As a result, struct cgroup_bpf uses 504 fewer bytes relative to when its arrays were sized using MAX_BPF_ATTACH_TYPE. bpf_cgroup_storage is notably not migrated as struct bpf_cgroup_storage_key is part of uapi and contains a bpf_attach_type member which is not meant to be opaque. Similarly, bpf_cgroup_link continues to report its bpf_attach_type member to userspace via fdinfo and bpf_link_info. To ease disambiguation, bpf_attach_type variables are renamed from 'type' to 'atype' when changed to cgroup_bpf_attach_type.
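With the mapping helper in place (it appears in the diff below), a call site validating an attach type before indexing the cgroup_bpf arrays reduces to a few lines; this is a hypothetical caller mirroring what the cgroup attach path does:

	enum cgroup_bpf_attach_type atype;

	atype = to_cgroup_bpf_attach_type(attach_type);
	if (atype == CGROUP_BPF_ATTACH_TYPE_INVALID)
		return -EINVAL;

	/* atype is now a valid index into the effective/progs/flags arrays */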
Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210819092420.1984861-2-davemarchevsky@fb.com --- include/linux/bpf-cgroup.h | 182 ++++++++++++++++++++++++++++++--------------- 1 file changed, 123 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a74cd1c3bd87..2746fd804216 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,9 +23,73 @@ struct ctl_table_header; struct task_struct; #ifdef CONFIG_CGROUP_BPF +enum cgroup_bpf_attach_type { + CGROUP_BPF_ATTACH_TYPE_INVALID = -1, + CGROUP_INET_INGRESS = 0, + CGROUP_INET_EGRESS, + CGROUP_INET_SOCK_CREATE, + CGROUP_SOCK_OPS, + CGROUP_DEVICE, + CGROUP_INET4_BIND, + CGROUP_INET6_BIND, + CGROUP_INET4_CONNECT, + CGROUP_INET6_CONNECT, + CGROUP_INET4_POST_BIND, + CGROUP_INET6_POST_BIND, + CGROUP_UDP4_SENDMSG, + CGROUP_UDP6_SENDMSG, + CGROUP_SYSCTL, + CGROUP_UDP4_RECVMSG, + CGROUP_UDP6_RECVMSG, + CGROUP_GETSOCKOPT, + CGROUP_SETSOCKOPT, + CGROUP_INET4_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, + CGROUP_INET4_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, + CGROUP_INET_SOCK_RELEASE, + MAX_CGROUP_BPF_ATTACH_TYPE +}; + +#define CGROUP_ATYPE(type) \ + case BPF_##type: return type + +static inline enum cgroup_bpf_attach_type +to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type) +{ + switch (attach_type) { + CGROUP_ATYPE(CGROUP_INET_INGRESS); + CGROUP_ATYPE(CGROUP_INET_EGRESS); + CGROUP_ATYPE(CGROUP_INET_SOCK_CREATE); + CGROUP_ATYPE(CGROUP_SOCK_OPS); + CGROUP_ATYPE(CGROUP_DEVICE); + CGROUP_ATYPE(CGROUP_INET4_BIND); + CGROUP_ATYPE(CGROUP_INET6_BIND); + CGROUP_ATYPE(CGROUP_INET4_CONNECT); + CGROUP_ATYPE(CGROUP_INET6_CONNECT); + CGROUP_ATYPE(CGROUP_INET4_POST_BIND); + CGROUP_ATYPE(CGROUP_INET6_POST_BIND); + CGROUP_ATYPE(CGROUP_UDP4_SENDMSG); + CGROUP_ATYPE(CGROUP_UDP6_SENDMSG); + CGROUP_ATYPE(CGROUP_SYSCTL); + CGROUP_ATYPE(CGROUP_UDP4_RECVMSG); + CGROUP_ATYPE(CGROUP_UDP6_RECVMSG); + CGROUP_ATYPE(CGROUP_GETSOCKOPT); + CGROUP_ATYPE(CGROUP_SETSOCKOPT); + CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE); + default: + return CGROUP_BPF_ATTACH_TYPE_INVALID; + } +} + +#undef CGROUP_ATYPE -extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; -#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) +extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE]; +#define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype]) #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -67,15 +131,15 @@ struct bpf_prog_array; struct cgroup_bpf { /* array of effective progs in this cgroup */ - struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; + struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE]; /* attached progs to this cgroup and attach flags * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will * have either zero or one element * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ - struct list_head progs[MAX_BPF_ATTACH_TYPE]; - u32 flags[MAX_BPF_ATTACH_TYPE]; + struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; + u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; /* list of cgroup shared storages */ struct list_head storages; @@ -115,28 +179,28 @@ int 
cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type); + short access, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, int *optname, char __user *optval, @@ -179,9 +243,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS)) \ + if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ - BPF_CGROUP_INET_INGRESS); \ + CGROUP_INET_INGRESS); \ \ __ret; \ }) @@ -189,54 +253,54 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ - BPF_CGROUP_INET_EGRESS); \ + CGROUP_INET_EGRESS); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, atype) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, type); \ + if (cgroup_bpf_enabled(atype)) { \ + __ret = __cgroup_bpf_run_filter_sk(sk, atype); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_RELEASE) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET4_POST_BIND) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND) -#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \ ({ \ u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + if (cgroup_bpf_enabled(atype)) \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, \ &__unused_flags); \ __ret; \ }) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) \ ({ \ u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ + if 
(cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ t_ctx, \ &__unused_flags); \ release_sock(sk); \ @@ -249,13 +313,13 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). */ -#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, bind_flags) \ +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags) \ ({ \ u32 __flags = 0; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ + if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ @@ -265,33 +329,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ - ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ - cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) && \ + ((cgroup_bpf_enabled(CGROUP_INET4_CONNECT) || \ + cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) && \ (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL) /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a * fullsock and its parent fullsock cannot be traced by @@ -311,33 +375,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS)) \ + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ + if 
(cgroup_bpf_enabled(CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE)) \ - __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ + if (cgroup_bpf_enabled(CGROUP_DEVICE)) \ + __ret = __cgroup_bpf_check_dev_permission(atype, major, minor, \ access, \ - BPF_CGROUP_DEVICE); \ + CGROUP_DEVICE); \ \ __ret; \ }) @@ -346,10 +410,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL)) \ + if (cgroup_bpf_enabled(CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ - BPF_CGROUP_SYSCTL); \ + CGROUP_SYSCTL); \ __ret; \ }) @@ -357,7 +421,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -368,7 +432,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) @@ -377,7 +441,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ @@ -392,7 +456,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ sock, level, optname, optval, optlen, retval); \ __ret; \ @@ -451,14 +515,14 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, return 0; } -#define cgroup_bpf_enabled(type) (0) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) +#define cgroup_bpf_enabled(atype) (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, flags) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) @@ -470,7 +534,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; 
}) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ -- cgit v1.2.3 From dab2ea6c680f87add6d2f7007ce46b6b9e3857f7 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 23 Aug 2021 20:02:38 +0200 Subject: ieee80211: add TWT element definitions Introduce TWT definitions and TWT Information element structure in ieee80211.h Tested-by: Peter Chiu Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/71d8b581fe4b5abc5b92f8d77ac2de3e2f7591b6.1629741512.git.lorenzo@kernel.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a6730072d13a..2e8953d80d4b 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1088,6 +1088,48 @@ struct ieee80211_ext { } u; } __packed __aligned(2); +#define IEEE80211_TWT_CONTROL_NDP BIT(0) +#define IEEE80211_TWT_CONTROL_RESP_MODE BIT(1) +#define IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST BIT(3) +#define IEEE80211_TWT_CONTROL_RX_DISABLED BIT(4) +#define IEEE80211_TWT_CONTROL_WAKE_DUR_UNIT BIT(5) + +#define IEEE80211_TWT_REQTYPE_REQUEST BIT(0) +#define IEEE80211_TWT_REQTYPE_SETUP_CMD GENMASK(3, 1) +#define IEEE80211_TWT_REQTYPE_TRIGGER BIT(4) +#define IEEE80211_TWT_REQTYPE_IMPLICIT BIT(5) +#define IEEE80211_TWT_REQTYPE_FLOWTYPE BIT(6) +#define IEEE80211_TWT_REQTYPE_FLOWID GENMASK(9, 7) +#define IEEE80211_TWT_REQTYPE_WAKE_INT_EXP GENMASK(14, 10) +#define IEEE80211_TWT_REQTYPE_PROTECTION BIT(15) + +enum ieee80211_twt_setup_cmd { + TWT_SETUP_CMD_REQUEST, + TWT_SETUP_CMD_SUGGEST, + TWT_SETUP_CMD_DEMAND, + TWT_SETUP_CMD_GROUPING, + TWT_SETUP_CMD_ACCEPT, + TWT_SETUP_CMD_ALTERNATE, + TWT_SETUP_CMD_DICTATE, + TWT_SETUP_CMD_REJECT, +}; + +struct ieee80211_twt_params { + __le16 req_type; + __le64 twt; + u8 min_twt_dur; + __le16 mantissa; + u8 channel; +} __packed; + +struct ieee80211_twt_setup { + u8 dialog_token; + u8 element_id; + u8 length; + u8 control; + u8 params[]; +} __packed; + struct ieee80211_mgmt { __le16 frame_control; __le16 duration; @@ -1252,6 +1294,10 @@ struct ieee80211_mgmt { __le16 toa_error; u8 variable[0]; } __packed ftm; + struct { + u8 action_code; + u8 variable[]; + } __packed s1g; } u; } __packed action; } u; @@ -2881,6 +2927,7 @@ enum ieee80211_eid { WLAN_EID_AID_RESPONSE = 211, WLAN_EID_S1G_BCN_COMPAT = 213, WLAN_EID_S1G_SHORT_BCN_INTERVAL = 214, + WLAN_EID_S1G_TWT = 216, WLAN_EID_S1G_CAPABILITIES = 217, WLAN_EID_VENDOR_SPECIFIC = 221, WLAN_EID_QOS_PARAMETER = 222, @@ -2950,6 +2997,7 @@ enum ieee80211_category { WLAN_CATEGORY_FST = 18, WLAN_CATEGORY_UNPROT_DMG = 20, WLAN_CATEGORY_VHT = 21, + WLAN_CATEGORY_S1G = 22, WLAN_CATEGORY_VENDOR_SPECIFIC_PROTECTED = 126, WLAN_CATEGORY_VENDOR_SPECIFIC = 127, }; @@ -3023,6 +3071,20 @@ enum ieee80211_key_len { WLAN_KEY_LEN_BIP_GMAC_256 = 32, }; +enum ieee80211_s1g_actioncode { + WLAN_S1G_AID_SWITCH_REQUEST, + WLAN_S1G_AID_SWITCH_RESPONSE, + WLAN_S1G_SYNC_CONTROL, + WLAN_S1G_STA_INFO_ANNOUNCE, + WLAN_S1G_EDCA_PARAM_SET, + WLAN_S1G_EL_OPERATION, + WLAN_S1G_TWT_SETUP, 
+ WLAN_S1G_TWT_TEARDOWN, + WLAN_S1G_SECT_GROUP_ID_LIST, + WLAN_S1G_SECT_ID_FEEDBACK, + WLAN_S1G_TWT_INFORMATION = 11, +}; + #define IEEE80211_WEP_IV_LEN 4 #define IEEE80211_WEP_ICV_LEN 4 #define IEEE80211_CCMP_HDR_LEN 8 -- cgit v1.2.3 From 95d1d2490c278ea316a4350f4c24818275fb989c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 23 Aug 2021 11:01:35 -0700 Subject: netdevice: move xdp_rxq within netdev_rx_queue Both struct netdev_rx_queue and struct xdp_rxq_info are cacheline aligned. This causes extra padding before and after the xdp_rxq member. Move the member upfront, so that it's naturally aligned. Before: /* size: 256, cachelines: 4, members: 6 */ /* sum members: 160, holes: 1, sum holes: 40 */ /* padding: 56 */ /* paddings: 1, sum paddings: 36 */ /* forced alignments: 1, forced holes: 1, sum forced holes: 40 */ After: /* size: 192, cachelines: 3, members: 6 */ /* padding: 32 */ /* paddings: 1, sum paddings: 36 */ /* forced alignments: 1 */ Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20210823180135.1153608-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2f03cd9e371a..b88ad5aef7fe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -722,13 +722,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, /* This structure contains an instance of an RX queue. */ struct netdev_rx_queue { + struct xdp_rxq_info xdp_rxq; #ifdef CONFIG_RPS struct rps_map __rcu *rps_map; struct rps_dev_flow_table __rcu *rps_flow_table; #endif struct kobject kobj; struct net_device *dev; - struct xdp_rxq_info xdp_rxq; #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif -- cgit v1.2.3 From 029ee6b14356b94120bedb852dcdaefc0a282cf1 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Fri, 20 Aug 2021 15:35:17 +0800 Subject: ethtool: add two coalesce attributes for CQE mode Currently, there are many drivers that support CQE mode configuration: some configure it as fixed at initialization time, while others provide an interface to change it via ethtool private flags. In order to make it more generic, add two new 'ETHTOOL_A_COALESCE_USE_CQE_TX' and 'ETHTOOL_A_COALESCE_USE_CQE_RX' coalesce attributes so that these parameters can be accessed via the ethtool netlink coalesce uAPI. Also add a new structure, kernel_ethtool_coalesce, so that the new parameters can be added to this struct.
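As a sketch of how a driver reports the mode once the follow-up patch extends the coalesce ops (the foo_* driver and its private fields are made up for illustration; only the struct and op signatures come from the patches):

struct foo_priv {
	u32 rx_usecs;
	u32 tx_usecs;
	u8 rx_cqe_mode;	/* 0 = EQE, 1 = CQE */
	u8 tx_cqe_mode;
};

static int foo_get_coalesce(struct net_device *ndev,
			    struct ethtool_coalesce *ec,
			    struct kernel_ethtool_coalesce *kernel_coal,
			    struct netlink_ext_ack *extack)
{
	struct foo_priv *priv = netdev_priv(ndev);

	/* legacy parameters keep flowing through ethtool_coalesce */
	ec->rx_coalesce_usecs = priv->rx_usecs;
	ec->tx_coalesce_usecs = priv->tx_usecs;
	/* netlink-only parameters go through the kernel-side struct */
	kernel_coal->use_cqe_mode_rx = priv->rx_cqe_mode;
	kernel_coal->use_cqe_mode_tx = priv->tx_cqe_mode;

	return 0;
}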
Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4711b96dae0c..a9d77a6a3e00 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -172,6 +172,11 @@ extern int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); +struct kernel_ethtool_coalesce { + u8 use_cqe_mode_tx; + u8 use_cqe_mode_rx; +}; + /** * ethtool_intersect_link_masks - Given two link masks, AND them together * @dst: first mask and where result is stored @@ -211,7 +216,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, #define ETHTOOL_COALESCE_TX_USECS_HIGH BIT(19) #define ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH BIT(20) #define ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL BIT(21) -#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(21, 0) +#define ETHTOOL_COALESCE_USE_CQE_RX BIT(22) +#define ETHTOOL_COALESCE_USE_CQE_TX BIT(23) +#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(23, 0) #define ETHTOOL_COALESCE_USECS \ (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS) @@ -237,6 +244,8 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_RX_USECS_HIGH | \ ETHTOOL_COALESCE_PKT_RATE_LOW | ETHTOOL_COALESCE_PKT_RATE_HIGH | \ ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL) +#define ETHTOOL_COALESCE_USE_CQE \ + (ETHTOOL_COALESCE_USE_CQE_RX | ETHTOOL_COALESCE_USE_CQE_TX) #define ETHTOOL_STAT_NOT_SET (~0ULL) -- cgit v1.2.3 From f3ccfda1931977b80267ba54070a1aeafa18f6ca Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Fri, 20 Aug 2021 15:35:18 +0800 Subject: ethtool: extend coalesce setting uAPI with CQE mode In order to support more coalesce parameters through netlink, add two new parameters, kernel_coal and extack, to .set_coalesce and .get_coalesce, so that extra info can be returned to user space via the netlink API. Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index a9d77a6a3e00..849524b55d89 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -15,6 +15,7 @@ #include #include +#include <linux/netlink.h> #include struct compat_ethtool_rx_flow_spec { @@ -611,8 +612,14 @@ struct ethtool_ops { struct ethtool_eeprom *, u8 *); int (*set_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); - int (*get_coalesce)(struct net_device *, struct ethtool_coalesce *); - int (*set_coalesce)(struct net_device *, struct ethtool_coalesce *); + int (*get_coalesce)(struct net_device *, + struct ethtool_coalesce *, + struct kernel_ethtool_coalesce *, + struct netlink_ext_ack *); + int (*set_coalesce)(struct net_device *, + struct ethtool_coalesce *, + struct kernel_ethtool_coalesce *, + struct netlink_ext_ack *); void (*get_ringparam)(struct net_device *, struct ethtool_ringparam *); int (*set_ringparam)(struct net_device *, -- cgit v1.2.3 From 1d71eb53e45187f58089d32b51e27784c791d90e Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 26 Jul 2021 20:36:54 -0700 Subject: Revert "PCI: Make pci_enable_ptm() private" Make pci_enable_ptm() accessible from the drivers. Exposing this enables a driver to use the 'ptm_enabled' field of 'pci_dev' to check whether PTM is enabled or not.
This reverts commit ac6c26da29c1 ("PCI: Make pci_enable_ptm() private"). Signed-off-by: Vinicius Costa Gomes Acked-by: Bjorn Helgaas Signed-off-by: Tony Nguyen --- include/linux/pci.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..21a9d244e4e4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1620,6 +1620,13 @@ static inline bool pci_aer_available(void) { return false; } bool pci_ats_disabled(void); +#ifdef CONFIG_PCIE_PTM +int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); +#else +static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +{ return -EINVAL; } +#endif + void pci_cfg_access_lock(struct pci_dev *dev); bool pci_cfg_access_trylock(struct pci_dev *dev); void pci_cfg_access_unlock(struct pci_dev *dev); -- cgit v1.2.3 From 014408cd624e9fd2820f4a593b710325ee05fec9 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 26 Jul 2021 20:36:55 -0700 Subject: PCI: Add pcie_ptm_enabled() Add a predicate that returns whether PCIe PTM (Precision Time Measurement) is enabled. It will only return true if it's enabled in all the ports in the path from the device to the root. Signed-off-by: Vinicius Costa Gomes Acked-by: Bjorn Helgaas Signed-off-by: Tony Nguyen --- include/linux/pci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 21a9d244e4e4..947430637cac 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1622,9 +1622,12 @@ bool pci_ats_disabled(void); #ifdef CONFIG_PCIE_PTM int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); +bool pcie_ptm_enabled(struct pci_dev *dev); #else static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) { return -EINVAL; } +static inline bool pcie_ptm_enabled(struct pci_dev *dev) +{ return false; } #endif void pci_cfg_access_lock(struct pci_dev *dev); -- cgit v1.2.3 From 406f42fa0d3cbcea3766c3111d79ac5afe711c5b Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 19 Aug 2021 10:17:27 +0300 Subject: net-next: When a bond has a massive amount of VLANs with IPv6 addresses, performance of changing link state, attaching a VRF, changing an IPv6 address, etc. goes down dramatically. The source of most of the slowdown is the `dev_addr_lists.c` module, which maintains a linked list of HW addresses. When using IPv6, this list grows for each IPv6 address added on a VLAN, since each IPv6 address has a multicast HW address associated with it. When performing any modification to the involved links, this list is traversed many times, often for nothing, all while holding the RTNL lock. Instead, this patch adds an auxiliary rbtree which cuts down traversal time significantly.
Performance can be seen with the following script: #!/bin/bash ip netns del test || true 2>/dev/null ip netns add test echo 1 | ip netns exec test tee /proc/sys/net/ipv6/conf/all/keep_addr_on_down > /dev/null set -e ip -n test link add foo type veth peer name bar ip -n test link add b1 type bond ip -n test link add florp type vrf table 10 ip -n test link set bar master b1 ip -n test link set foo up ip -n test link set bar up ip -n test link set b1 up ip -n test link set florp up VLAN_COUNT=1500 BASE_DEV=b1 echo Creating vlans ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT); do ip -n test link add link $BASE_DEV name foo.\$i type vlan id \$i; done" echo Bringing them up ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT); do ip -n test link set foo.\$i up; done" echo Assigning IPv6 Addresses ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT); do ip -n test address add dev foo.\$i 2000::\$i/64; done" echo Attaching to VRF ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT); do ip -n test link set foo.\$i master florp; done" On an Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz machine, the performance before the patch is (truncated): Creating vlans real 108.35 Bringing them up real 4.96 Assigning IPv6 Addresses real 19.22 Attaching to VRF real 458.84 After the patch: Creating vlans real 5.59 Bringing them up real 5.07 Assigning IPv6 Addresses real 5.64 Attaching to VRF real 25.37 Cc: David S. Miller Cc: Jakub Kicinski Cc: Lu Wei Cc: Xiongfeng Wang Cc: Taehee Yoo Signed-off-by: Gilad Naaman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b88ad5aef7fe..6fd3a4d42668 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -47,6 +47,7 @@ #include #include #include +#include <linux/rbtree.h> struct netpoll_info; struct device; @@ -208,6 +209,7 @@ struct sk_buff; struct netdev_hw_addr { struct list_head list; + struct rb_node node; unsigned char addr[MAX_ADDR_LEN]; unsigned char type; #define NETDEV_HW_ADDR_T_LAN 1 @@ -224,6 +226,9 @@ struct netdev_hw_addr { struct netdev_hw_addr_list { struct list_head list; int count; + + /* Auxiliary tree for faster lookup on addition and deletion */ + struct rb_root tree; }; #define netdev_hw_addr_list_count(l) ((l)->count) -- cgit v1.2.3 From b0b8c67eaa5c65f8426017e78fcce12dc7d85110 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 24 Aug 2021 20:15:01 +0300 Subject: net: dsa: sja1105: drop untagged packets on the CPU and DSA ports The sja1105 driver is a bit special in its use of VLAN headers as DSA tags. This is because in VLAN-aware mode, the VLAN headers use an actual TPID of 0x8100, which is understood even by the DSA master as an actual VLAN header. Furthermore, control packets such as PTP and STP are transmitted with no VLAN header as a DSA tag, because, depending on switch generation, there are ways other than VLAN tags to steer these control packets towards a precise egress port. Transmitting control packets as untagged means leaving a door open for traffic in general to be transmitted as untagged from the DSA master, and for it to traverse the switch and exit a random switch port according to the FDB lookup. This behavior is a bit out of line with other DSA drivers, which have native support for DSA tagging.
There, it is to be expected that the switch only accepts DSA-tagged packets on its CPU port, dropping everything that does not match this pattern. We perhaps rely a bit too much on the switches' hardware dropping on the CPU port, and place no other restrictions in the kernel data path to avoid that. For example, sja1105 is also a bit special in that STP/PTP packets are transmitted using "management routes" (sja1105_port_deferred_xmit): when sending a link-local packet from the CPU, we must first write a SPI message to the switch to tell it to expect a packet towards multicast MAC DA 01-80-c2-00-00-0e, and to route it towards port 3 when it gets it. This entry expires as soon as it matches a packet received by the switch, and it needs to be reinstalled for the next packet etc. All in all quite a ghetto mechanism, but it is all that the sja1105 switches offer for injecting a control packet. The driver takes a mutex for serializing control packets and making the pairs of SPI writes of a management route and its associated skb atomic, but to be honest, a mutex is only relevant as long as all parties agree to take it. With the DSA design, it is possible to open an AF_PACKET socket on the DSA master net device, and blast packets towards 01-80-c2-00-00-0e, and whatever locking the DSA switch driver might use, it all goes kaput because management routes installed by the driver will match skbs sent by the DSA master, and not skbs generated by the driver itself. So they will end up being routed on the wrong port. So through the lens of that, maybe it would make sense to avoid that from happening by doing something in the network stack, like: introduce a new bit in struct sk_buff, like xmit_from_dsa. Then, somewhere around dev_hard_start_xmit(), introduce the following check: if (netdev_uses_dsa(dev) && !skb->xmit_from_dsa) kfree_skb(skb); Ok, maybe that is a bit drastic, but that would at least prevent a bunch of problems. For example, right now, even though the majority of DSA switches drop packets without DSA tags sent by the DSA master (and therefore the majority of garbage that user space daemons like avahi and udhcpcd and friends create), it is still conceivable that an aggressive user space program can open an AF_PACKET socket and inject a spoofed DSA tag directly on the DSA master. We have no protection against that; the packet will be understood by the switch and be routed wherever user space says. Furthermore: there are some DSA switches where we even have register access over Ethernet, using DSA tags. So even user space drivers are possible in this way. This is a huge hole. However, the biggest thing that bothers me is that udhcpcd attempts to ask for an IP address on all interfaces by default, and with sja1105, it will attempt to get a valid IP address on both the DSA master as well as on sja1105 switch ports themselves. So with IP addresses in the same subnet on multiple interfaces, the routing table will be messed up and the system will be unusable for traffic until it is configured manually to not ask for an IP address on the DSA master itself. It turns out that it is possible to avoid that in the sja1105 driver, at least very superficially, by requesting the switch to drop VLAN-untagged packets on the CPU port. With the exception of control packets, all traffic originated from tag_sja1105.c is already VLAN-tagged, so only STP and PTP packets need to be converted. 
For that, we need to uphold the equivalence between an untagged and a pvid-tagged packet, and to remember that the CPU port of sja1105 uses a pvid of 4095. Now that we drop untagged traffic on the CPU port, non-aggressive user space applications like udhcpcd stop bothering us, and sja1105 effectively becomes just as vulnerable to the aggressive kind of user space programs as other DSA switches are (ok, users can also create 8021q uppers on top of the DSA master in the case of sja1105, but in future patches we can easily deny that, but it still doesn't change the fact that VLAN-tagged packets can still be injected over raw sockets). Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 6b0dc9ff92d1..8c5601f1c979 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -16,6 +16,8 @@ #define ETH_P_SJA1105_META 0x0008 #define ETH_P_SJA1110 0xdadc +#define SJA1105_DEFAULT_VLAN (VLAN_N_VID - 1) + /* IEEE 802.3 Annex 57A: Slow Protocols PDUs (01:80:C2:xx:xx:xx) */ #define SJA1105_LINKLOCAL_FILTER_A 0x0180C2000000ull #define SJA1105_LINKLOCAL_FILTER_A_MASK 0xFFFFFF000000ull -- cgit v1.2.3 From 8ded9160928e545c2e694b77a87263fa078ff4c6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 24 Aug 2021 20:15:02 +0300 Subject: net: dsa: tag_sja1105: stop asking the sja1105 driver in sja1105_xmit_tpid Introduced in commit 38b5beeae7a4 ("net: dsa: sja1105: prepare tagger for handling DSA tags and VLAN simultaneously"), the sja1105_xmit_tpid function solved quite a different problem than our needs are now. Then, we used best-effort VLAN filtering and we were using the xmit_tpid to tunnel packets coming from an 8021q upper through the TX VLAN allocated by tag_8021q to that egress port. The need for a different VLAN protocol depending on switch revision came from the fact that this in itself was more of a hack to trick the hardware into accepting tunneled VLANs in the first place. Right now, we deny 8021q uppers (see sja1105_prechangeupper). Even if we supported them again, we would not do that using the same method of {tunneling the VLAN on egress, retagging the VLAN on ingress} that we had in the best-effort VLAN filtering mode. It seems rather simpler that we just allocate a VLAN in the VLAN table that is simply not used by the bridge at all, or by any other port. Anyway, I have 2 gripes with the current sja1105_xmit_tpid: 1. When sending packets on behalf of a VLAN-aware bridge (with the new TX forwarding offload framework) plus untagged (with the tag_8021q VLAN added by the tagger) packets, we can see that on SJA1105P/Q/R/S and later (which have a qinq_tpid of ETH_P_8021AD), some packets sent through the DSA master have a VLAN protocol of 0x8100 and others of 0x88a8. This is strange and there is no reason for it now. If we have a bridge and are therefore forced to send using that bridge's TPID, we can as well blend with that bridge's VLAN protocol for all packets. 2. The sja1105_xmit_tpid introduces a dependency on the sja1105 driver, because it looks inside dp->priv. It is desirable to keep as much separation between taggers and switch drivers as possible. Now it doesn't do that anymore. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/dsa/sja1105.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 8c5601f1c979..171106202fe5 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -67,7 +67,6 @@ struct sja1105_port { struct sja1105_tagger_data *data; struct dsa_port *dp; bool hwts_tx_en; - u16 xmit_tpid; }; enum sja1110_meta_tstamp { -- cgit v1.2.3 From 1b07d00a15d6a96d1a36b6a284c4fd5f2e2fa383 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:46 -0700 Subject: bpf: Add BTF_ID_LIST_GLOBAL_SINGLE macro Same as BTF_ID_LIST_SINGLE macro except defines a global ID. Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/a867a97517df42fd3953eeb5454402b57e74538f.1629772842.git.dxu@dxuuu.xyz --- include/linux/btf_ids.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index bed4b9964581..6d1395030616 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -82,6 +82,9 @@ __BTF_ID_LIST(name, globl) #define BTF_ID_LIST_SINGLE(name, prefix, typename) \ BTF_ID_LIST(name) \ BTF_ID(prefix, typename) +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) \ + BTF_ID_LIST_GLOBAL(name) \ + BTF_ID(prefix, typename) /* * The BTF_ID_UNUSED macro defines 4 zero bytes. -- cgit v1.2.3 From 33c5cb36015ac1034b50b823fae367e908d05147 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:47 -0700 Subject: bpf: Consolidate task_struct BTF_ID declarations No need to have it defined 5 times. Once is enough. Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/6dcefa5bed26fe1226f26683f36819bb53ec19a2.1629772842.git.dxu@dxuuu.xyz --- include/linux/btf_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 6d1395030616..93d881ab0d48 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -188,4 +188,6 @@ MAX_BTF_SOCK_TYPE, extern u32 btf_sock_ids[]; #endif +extern u32 btf_task_struct_ids[]; + #endif -- cgit v1.2.3 From eb529c5b10b9401a0f2d1f469e82c6a0ba98082c Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 25 Aug 2021 18:48:31 -0700 Subject: bpf: Fix bpf-next builds without CONFIG_BPF_EVENTS This commit fixes linker errors along the lines of: s390-linux-ld: task_iter.c:(.init.text+0xa4): undefined reference to `btf_task_struct_ids'` Fix by defining btf_task_struct_ids unconditionally in kernel/bpf/btf.c since there exists code that unconditionally uses btf_task_struct_ids. 
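Concretely, the definition this fixes (mirroring how kernel/bpf/btf.c uses the macro; shown here for illustration) is a one-liner, and the stub added below keeps it a plain zeroed u32 array when BTF info is not built in, so references still link:

BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)

With CONFIG_DEBUG_INFO_BTF set, this entry is patched with the real BTF type ID of struct task_struct by resolve_btfids during the vmlinux link.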
Reported-by: kernel test robot Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/05d94748d9f4b3eecedc4fddd6875418a396e23c.1629942444.git.dxu@dxuuu.xyz --- include/linux/btf_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 93d881ab0d48..47d9abfbdb55 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -151,6 +151,7 @@ extern struct btf_id_set name; #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; #define BTF_SET_END(name) -- cgit v1.2.3 From a1ef61825469b874920f4afb889e1a92353680ff Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Fri, 20 Aug 2021 08:20:35 -0400 Subject: ieee80211: add definition of regulatory info in 6 GHz operation information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IEEE Std 802.11ax™-2021 added a regulatory info subfield to the HE operation element; add it to the header file. Signed-off-by: Wen Gong Link: https://lore.kernel.org/r/20210820122041.12157-3-wgong@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2e8953d80d4b..f91cb15a74e7 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2312,6 +2312,9 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) #define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x40000000 #define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED 0x80000000 +#define IEEE80211_6GHZ_CTRL_REG_LPI_AP 0 +#define IEEE80211_6GHZ_CTRL_REG_SP_AP 1 + /** * ieee80211_he_6ghz_oper - HE 6 GHz operation Information field * @primary: primary channel @@ -2328,6 +2331,7 @@ struct ieee80211_he_6ghz_oper { #define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ 2 #define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ 3 #define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON 0x4 +#define IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO 0x38 u8 control; u8 ccfs0; u8 ccfs1; -- cgit v1.2.3 From ad31393b98e4addbc5f1ccc484bfbb8d07c92056 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Fri, 20 Aug 2021 08:20:39 -0400 Subject: ieee80211: add definition for transmit power envelope element MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IEEE Std 802.11ax™-2021 makes changes to the transmit power envelope element; adjust the code accordingly.
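A sketch of consuming the element once it has been parsed, using the masks this patch adds below; the helper itself is illustrative, and u8_get_bits() comes from <linux/bitfield.h>:

static void tpe_dump_info(const struct ieee80211_tx_pwr_env *env)
{
	u8 count = u8_get_bits(env->tx_power_info,
			       IEEE80211_TX_PWR_ENV_INFO_COUNT);
	u8 interpret = u8_get_bits(env->tx_power_info,
				   IEEE80211_TX_PWR_ENV_INFO_INTERPRET);
	u8 category = u8_get_bits(env->tx_power_info,
				  IEEE80211_TX_PWR_ENV_INFO_CATEGORY);

	/* interpret selects EIRP vs. PSD encoding (local or regulatory
	 * client), count bounds the valid tx_power[] entries
	 */
	pr_debug("TPE: count %u interpret %u category %u\n",
		 count, interpret, category);
}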
Signed-off-by: Wen Gong Link: https://lore.kernel.org/r/20210820122041.12157-7-wgong@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f91cb15a74e7..694264503119 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2338,6 +2338,44 @@ struct ieee80211_he_6ghz_oper { u8 minrate; } __packed; +/* + * In "9.4.2.161 Transmit Power Envelope element" of "IEEE Std 802.11ax-2021", + * it shows four types in "Table 9-275a-Maximum Transmit Power Interpretation + * subfield encoding", and two categories for each type in "Table E-12-Regulatory + * Info subfield encoding in the United States". + * So there can be at most 8 Transmit Power Envelope elements in total. + */ +#define IEEE80211_TPE_MAX_IE_COUNT 8 +/* + * In "Table 9-277—Meaning of Maximum Transmit Power Count subfield" + * of "IEEE Std 802.11ax™‐2021", the max power level is 8. + */ +#define IEEE80211_MAX_NUM_PWR_LEVEL 8 + +#define IEEE80211_TPE_MAX_POWER_COUNT 8 + +/* transmit power interpretation type of transmit power envelope element */ +enum ieee80211_tx_power_intrpt_type { + IEEE80211_TPE_LOCAL_EIRP, + IEEE80211_TPE_LOCAL_EIRP_PSD, + IEEE80211_TPE_REG_CLIENT_EIRP, + IEEE80211_TPE_REG_CLIENT_EIRP_PSD, +}; + +/** + * struct ieee80211_tx_pwr_env + * + * This structure represents the "Transmit Power Envelope element" + */ +struct ieee80211_tx_pwr_env { + u8 tx_power_info; + s8 tx_power[IEEE80211_TPE_MAX_POWER_COUNT]; +} __packed; + +#define IEEE80211_TX_PWR_ENV_INFO_COUNT 0x7 +#define IEEE80211_TX_PWR_ENV_INFO_INTERPRET 0x38 +#define IEEE80211_TX_PWR_ENV_INFO_CATEGORY 0xC0 + /* * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size * @he_oper_ie: byte data of the He Operations IE, starting from the byte @@ -2919,7 +2957,7 @@ enum ieee80211_eid { WLAN_EID_VHT_OPERATION = 192, WLAN_EID_EXTENDED_BSS_LOAD = 193, WLAN_EID_WIDE_BW_CHANNEL_SWITCH = 194, - WLAN_EID_VHT_TX_POWER_ENVELOPE = 195, + WLAN_EID_TX_POWER_ENVELOPE = 195, WLAN_EID_CHANNEL_SWITCH_WRAPPER = 196, WLAN_EID_AID = 197, WLAN_EID_QUIET_CHANNEL = 198, -- cgit v1.2.3 From d0efb16294d145d157432feda83877ae9d7cdf37 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 26 Aug 2021 12:46:01 -0700 Subject: net: don't unconditionally copy_from_user a struct ifreq for socket ioctls A common implementation of isatty(3) involves calling a ioctl passing a dummy struct argument and checking whether the syscall failed -- bionic and glibc use TCGETS (passing a struct termios), and musl uses TIOCGWINSZ (passing a struct winsize). If the FD is a socket, we will copy sizeof(struct ifreq) bytes of data from the argument and return -EFAULT if that fails. The result is that the isatty implementations may return a non-POSIX-compliant value in errno in the case where part of the dummy struct argument is inaccessible, as both struct termios and struct winsize are smaller than struct ifreq (at least on arm64). Although there is usually enough stack space following the argument on the stack that this did not present a practical problem up to now, with MTE stack instrumentation it's more likely for the copy to fail, as the memory following the struct may have a different tag. Fix the problem by adding an early check for whether the ioctl is a valid socket ioctl, and return -ENOTTY if it isn't.
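The shape of the fix, as a sketch (the function below is a stand-in for the socket ioctl entry points the patch touches, not a verbatim hunk; only is_socket_ioctl_cmd() comes from the diff):

static int ifreq_ioctl(struct socket *sock, unsigned int cmd,
		       struct ifreq __user *uifr)
{
	struct ifreq ifr;

	/* bail out before the copy, so a TCGETS/TIOCGWINSZ probe on a
	 * socket FD now yields -ENOTTY instead of a possible -EFAULT
	 */
	if (!is_socket_ioctl_cmd(cmd))
		return -ENOTTY;

	if (copy_from_user(&ifr, uifr, sizeof(ifr)))
		return -EFAULT;

	/* ... hand off to dev_ioctl() as before ... */
	return 0;
}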
Fixes: 44c02a2c3dc5 ("dev_ioctl(): move copyin/copyout to callers") Link: https://linux-review.googlesource.com/id/I869da6cf6daabc3e4b7b82ac979683ba05e27d4d Signed-off-by: Peter Collingbourne Cc: # 4.19 Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eaf5bb008aa9..d65ce093e5a7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4012,6 +4012,10 @@ int netdev_rx_handler_register(struct net_device *dev, void netdev_rx_handler_unregister(struct net_device *dev); bool dev_valid_name(const char *name); +static inline bool is_socket_ioctl_cmd(unsigned int cmd) +{ + return _IOC_TYPE(cmd) == SOCK_IOC_TYPE; +} int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout); int dev_ifconf(struct net *net, struct ifconf *, int); -- cgit v1.2.3 From 81f9ebd43659320a88cae8ed5124c50b4d47ab66 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 22 Aug 2021 01:58:00 +0200 Subject: ssb: Drop legacy header include The SSB header only uses the legacy <linux/gpio.h> header to get struct gpio_chip, so include <linux/gpio/driver.h>, which is the right include to deal with gpio_chip. Cc: Michael Buesch Cc: Kalle Valo Signed-off-by: Linus Walleij Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210821235800.138817-1-linus.walleij@linaro.org --- include/linux/ssb/ssb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index 0d5a2691e7e9..f9b53acb4e02 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -7,7 +7,7 @@ #include #include #include -#include <linux/gpio.h> +#include <linux/gpio/driver.h> #include #include #include -- cgit v1.2.3 From 8d4be124062bddbb2bcb887702a0601b790b9a83 Mon Sep 17 00:00:00 2001 From: Jing Yangyang Date: Mon, 23 Aug 2021 23:13:41 -0700 Subject: ssb: fix boolreturn.cocci warning ./include/linux/ssb/ssb_driver_extif.h:200:8-9:WARNING: return of 0/1 in function 'ssb_extif_available' with return type bool. Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci Reported-by: Zeal Robot Signed-off-by: Jing Yangyang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210824061341.59255-1-deng.changcheng@zte.com.cn --- include/linux/ssb/ssb_driver_extif.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb_driver_extif.h b/include/linux/ssb/ssb_driver_extif.h index 3f8bc973d67d..19253bfacd1a 100644 --- a/include/linux/ssb/ssb_driver_extif.h +++ b/include/linux/ssb/ssb_driver_extif.h @@ -197,7 +197,7 @@ struct ssb_extif { static inline bool ssb_extif_available(struct ssb_extif *extif) { - return 0; + return false; } static inline -- cgit v1.2.3