From 1705643faecde95bdeb11bea5ab5baed084e9f91 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Tue, 19 Oct 2021 05:30:20 +0800 Subject: mmc: add MT7921 SDIO identifiers for MediaTek Bluetooth devices The MT7961 SDIO identifier for MediaTek Bluetooth devices were being referred in the MediaTek Bluetooth driver. Co-developed-by: Mark-yw Chen Signed-off-by: Mark-yw Chen Signed-off-by: Sean Wang Signed-off-by: Marcel Holtmann --- include/linux/mmc/sdio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index a85c9f0bd470..53f0efa0bccf 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -105,6 +105,7 @@ #define SDIO_VENDOR_ID_MEDIATEK 0x037a #define SDIO_DEVICE_ID_MEDIATEK_MT7663 0x7663 #define SDIO_DEVICE_ID_MEDIATEK_MT7668 0x7668 +#define SDIO_DEVICE_ID_MEDIATEK_MT7961 0x7961 #define SDIO_VENDOR_ID_MICROCHIP_WILC 0x0296 #define SDIO_DEVICE_ID_MICROCHIP_WILC1000 0x5347 -- cgit v1.2.3 From 7c7e3d31e7856a8260a254f8c71db416f7f9f5a1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 5 Nov 2021 16:23:29 -0700 Subject: bpf: Introduce helper bpf_find_vma In some profiler use cases, it is necessary to map an address to the backing file, e.g., a shared library. bpf_find_vma helper provides a flexible way to achieve this. bpf_find_vma maps an address of a task to the vma (vm_area_struct) for this address, and feed the vma to an callback BPF function. The callback function is necessary here, as we need to ensure mmap_sem is unlocked. It is necessary to lock mmap_sem for find_vma. To lock and unlock mmap_sem safely when irqs are disable, we use the same mechanism as stackmap with build_id. Specifically, when irqs are disabled, the unlocked is postponed in an irq_work. Refactor stackmap.c so that the irq_work is shared among bpf_find_vma and stackmap helpers. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Tested-by: Hengqi Chen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211105232330.1936330-2-songliubraving@fb.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2be6dfd68df9..df3410bff4b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2157,6 +2157,7 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; +extern const struct bpf_func_proto bpf_find_vma_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From f89315650ba34ec6c91a8bded72796980bee2a4d Mon Sep 17 00:00:00 2001 From: Mark Pashmfouroush Date: Wed, 10 Nov 2021 11:10:15 +0000 Subject: bpf: Add ingress_ifindex to bpf_sk_lookup It may be helpful to have access to the ifindex during bpf socket lookup. An example may be to scope certain socket lookup logic to specific interfaces, i.e. an interface may be made exempt from custom lookup code. Add the ifindex of the arriving connection to the bpf_sk_lookup API. Signed-off-by: Mark Pashmfouroush Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211110111016.5670-2-markpash@cloudflare.com --- include/linux/filter.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 24b7ed2677af..b6a216eb217a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1374,6 +1374,7 @@ struct bpf_sk_lookup_kern { const struct in6_addr *daddr; } v6; struct sock *selected_sk; + u32 ingress_ifindex; bool no_reuseport; }; @@ -1436,7 +1437,7 @@ extern struct static_key_false bpf_sk_lookup_enabled; static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 dport, - struct sock **psk) + const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; @@ -1452,6 +1453,7 @@ static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, .v4.daddr = daddr, .sport = sport, .dport = dport, + .ingress_ifindex = ifindex, }; u32 act; @@ -1474,7 +1476,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, const __be16 sport, const struct in6_addr *daddr, const u16 dport, - struct sock **psk) + const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; @@ -1490,6 +1492,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, .v6.daddr = daddr, .sport = sport, .dport = dport, + .ingress_ifindex = ifindex, }; u32 act; -- cgit v1.2.3 From 9e2ad638ae3632ef916ceb39f70e3104bf8fdc97 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 12 Nov 2021 07:02:42 -0800 Subject: bpf: Extend BTF_ID_LIST_GLOBAL with parameter for number of IDs syzbot reported the following BUG w/o CONFIG_DEBUG_INFO_BTF BUG: KASAN: global-out-of-bounds in task_iter_init+0x212/0x2e7 kernel/bpf/task_iter.c:661 Read of size 4 at addr ffffffff90297404 by task swapper/0/1 CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.15.0-syzkaller #0 Hardware name: ... Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0xf/0x309 mm/kasan/report.c:256 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 task_iter_init+0x212/0x2e7 kernel/bpf/task_iter.c:661 do_one_initcall+0x103/0x650 init/main.c:1295 do_initcall_level init/main.c:1368 [inline] do_initcalls init/main.c:1384 [inline] do_basic_setup init/main.c:1403 [inline] kernel_init_freeable+0x6b1/0x73a init/main.c:1606 kernel_init+0x1a/0x1d0 init/main.c:1497 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 This is caused by hard-coded name[1] in BTF_ID_LIST_GLOBAL (w/o CONFIG_DEBUG_INFO_BTF). Fix this by adding a parameter n to BTF_ID_LIST_GLOBAL. This avoids ifdef CONFIG_DEBUG_INFO_BTF in btf.c and filter.c. Fixes: 7c7e3d31e785 ("bpf: Introduce helper bpf_find_vma") Reported-by: syzbot+e0d81ec552a21d9071aa@syzkaller.appspotmail.com Reported-by: Eric Dumazet Suggested-by: Eric Dumazet Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211112150243.1270987-2-songliubraving@fb.com --- include/linux/btf_ids.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 47d9abfbdb55..6bb42b785293 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -73,7 +73,7 @@ asm( \ __BTF_ID_LIST(name, local) \ extern u32 name[]; -#define BTF_ID_LIST_GLOBAL(name) \ +#define BTF_ID_LIST_GLOBAL(name, n) \ __BTF_ID_LIST(name, globl) /* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with @@ -83,7 +83,7 @@ __BTF_ID_LIST(name, globl) BTF_ID_LIST(name) \ BTF_ID(prefix, typename) #define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) \ - BTF_ID_LIST_GLOBAL(name) \ + BTF_ID_LIST_GLOBAL(name, 1) \ BTF_ID(prefix, typename) /* @@ -149,7 +149,7 @@ extern struct btf_id_set name; #define BTF_ID_LIST(name) static u32 name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED -#define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_ID_LIST_GLOBAL(name, n) u32 name[n]; #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; #define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; -- cgit v1.2.3 From d19ddb476a539fd78ad1028ae13bb38506286931 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 12 Nov 2021 07:02:43 -0800 Subject: bpf: Introduce btf_tracing_ids Similar to btf_sock_ids, btf_tracing_ids provides btf ID for task_struct, file, and vm_area_struct via easy to understand format like btf_tracing_ids[BTF_TRACING_TYPE_[TASK|file|VMA]]. Suggested-by: Alexei Starovoitov Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211112150243.1270987-3-songliubraving@fb.com --- include/linux/btf_ids.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 6bb42b785293..919c0fde1c51 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -189,6 +189,18 @@ MAX_BTF_SOCK_TYPE, extern u32 btf_sock_ids[]; #endif -extern u32 btf_task_struct_ids[]; +#define BTF_TRACING_TYPE_xxx \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_TASK, task_struct) \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_FILE, file) \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_VMA, vm_area_struct) + +enum { +#define BTF_TRACING_TYPE(name, type) name, +BTF_TRACING_TYPE_xxx +#undef BTF_TRACING_TYPE +MAX_BTF_TRACING_TYPE, +}; + +extern u32 btf_tracing_ids[]; #endif -- cgit v1.2.3 From 02d6fdecb9c38de19065f6bed8d5214556fd061d Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Thu, 4 Nov 2021 16:00:40 +0100 Subject: regmap: allow to define reg_update_bits for no bus configuration Some device requires a special handling for reg_update_bits and can't use the normal regmap read write logic. An example is when locking is handled by the device and rmw operations requires to do atomic operations. Allow to declare a dedicated function in regmap_config for reg_update_bits in no bus configuration. Signed-off-by: Ansuel Smith Link: https://lore.kernel.org/r/20211104150040.1260-1-ansuelsmth@gmail.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index e3c9a25a853a..22652e5fbc38 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -290,6 +290,11 @@ typedef void (*regmap_unlock)(void *); * read operation on a bus such as SPI, I2C, etc. Most of the * devices do not need this. * @reg_write: Same as above for writing. + * @reg_update_bits: Optional callback that if filled will be used to perform + * all the update_bits(rmw) operation. Should only be provided + * if the function require special handling with lock and reg + * handling and the operation cannot be represented as a simple + * update_bits operation on a bus such as SPI, I2C, etc. * @fast_io: Register IO is fast. Use a spinlock instead of a mutex * to perform locking. This field is ignored if custom lock/unlock * functions are used (see fields lock/unlock of struct regmap_config). @@ -372,6 +377,8 @@ struct regmap_config { int (*reg_read)(void *context, unsigned int reg, unsigned int *val); int (*reg_write)(void *context, unsigned int reg, unsigned int val); + int (*reg_update_bits)(void *context, unsigned int reg, + unsigned int mask, unsigned int val); bool fast_io; -- cgit v1.2.3 From 13cae4a104d2b7205696229ba85d34cc035f8c84 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Mon, 15 Nov 2021 10:49:21 +0800 Subject: i2c: core: Allow 255 byte transfers for SMBus 3.x SMBus 3.0 increased the maximum block transfer size from 32 bytes to 255 bytes. We increase the size of struct i2c_smbus_data's block[] member. i2c_smbus_xfer() and i2c_smbus_xfer_emulated() now support 255 byte block operations, other block functions remain limited to 32 bytes for compatibility with existing callers. We allow adapters to indicate support for the larger size with I2C_FUNC_SMBUS_V3_BLOCK. Most emulated drivers should be able to use 255 byte blocks by replacing I2C_SMBUS_BLOCK_MAX with I2C_SMBUS_V3_BLOCK_MAX though some will have hardware limitations that need testing. Signed-off-by: Matt Johnston Signed-off-by: David S. Miller --- include/linux/i2c.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 16119ac1aa97..353d6b4e7a53 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -52,6 +52,19 @@ typedef int (*i2c_slave_cb_t)(struct i2c_client *client, struct module; struct property_entry; +/* SMBus 3.0 extends the maximum block read/write size to 255 (from 32). + * The larger size is only supported by some drivers, indicated by + * the I2C_FUNC_SMBUS_V3_BLOCK functionality bit. + */ +#define I2C_SMBUS_V3_BLOCK_MAX 255 /* As specified in SMBus 3.0 standard */ + +/* Note compatibility definition in uapi header with 32 byte block */ +union i2c_smbus_data { + __u8 byte; + __u16 word; + __u8 block[I2C_SMBUS_V3_BLOCK_MAX + 1]; /* block[0] is used for length */ +}; + #if IS_ENABLED(CONFIG_I2C) /* Return the Frequency mode string based on the bus frequency */ const char *i2c_freq_mode_string(u32 bus_freq_hz); -- cgit v1.2.3 From 34ae2c09d46a2d0abd907e139b466f798e4095a8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 15 Nov 2021 10:00:27 +0000 Subject: net: phylink: add generic validate implementation Add a generic validate() implementation using the supported_interfaces and a bitmask of MAC pause/speed/duplex capabilities. This allows us to entirely eliminate many driver private validate() implementations. We expose the underlying phylink_get_linkmodes() function so that drivers which have special needs can still benefit from conversion. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index f037470b6fb3..3563820a1765 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -20,6 +20,29 @@ enum { MLO_AN_PHY = 0, /* Conventional PHY */ MLO_AN_FIXED, /* Fixed-link mode */ MLO_AN_INBAND, /* In-band protocol */ + + MAC_SYM_PAUSE = BIT(0), + MAC_ASYM_PAUSE = BIT(1), + MAC_10HD = BIT(2), + MAC_10FD = BIT(3), + MAC_10 = MAC_10HD | MAC_10FD, + MAC_100HD = BIT(4), + MAC_100FD = BIT(5), + MAC_100 = MAC_100HD | MAC_100FD, + MAC_1000HD = BIT(6), + MAC_1000FD = BIT(7), + MAC_1000 = MAC_1000HD | MAC_1000FD, + MAC_2500FD = BIT(8), + MAC_5000FD = BIT(9), + MAC_10000FD = BIT(10), + MAC_20000FD = BIT(11), + MAC_25000FD = BIT(12), + MAC_40000FD = BIT(13), + MAC_50000FD = BIT(14), + MAC_56000FD = BIT(15), + MAC_100000FD = BIT(16), + MAC_200000FD = BIT(17), + MAC_400000FD = BIT(18), }; static inline bool phylink_autoneg_inband(unsigned int mode) @@ -69,6 +92,7 @@ enum phylink_op_type { * if MAC link is at %MLO_AN_FIXED mode. * @supported_interfaces: bitmap describing which PHY_INTERFACE_MODE_xxx * are supported by the MAC/PCS. + * @mac_capabilities: MAC pause/speed/duplex capabilities. */ struct phylink_config { struct device *dev; @@ -79,6 +103,7 @@ struct phylink_config { void (*get_fixed_state)(struct phylink_config *config, struct phylink_link_state *state); DECLARE_PHY_INTERFACE_MASK(supported_interfaces); + unsigned long mac_capabilities; }; /** @@ -442,6 +467,12 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, phy_interface_t interface, int speed, int duplex); #endif +void phylink_get_linkmodes(unsigned long *linkmodes, phy_interface_t interface, + unsigned long mac_capabilities); +void phylink_generic_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state); + struct phylink *phylink_create(struct phylink_config *, struct fwnode_handle *, phy_interface_t iface, const struct phylink_mac_ops *mac_ops); -- cgit v1.2.3 From 2f6a470d6545841cf1891b87e360d3998ef024c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Nov 2021 07:49:46 -0800 Subject: Revert "Merge branch 'mctp-i2c-driver'" This reverts commit 71812af7234f30362b43ccff33f93890ae4c0655, reversing changes made to cc0be1ad686fb29a4d127948486f40b17fb34b50. Wolfram Sang says: Please revert. Besides the driver in net, it modifies the I2C core code. This has not been acked by the I2C maintainer (in this case me). So, please don't pull this in via the net tree. The question raised here (extending SMBus calls to 255 byte) is complicated because we need ABI backwards compatibility. Link: https://lore.kernel.org/all/YZJ9H4eM%2FM7OXVN0@shikoro/ Signed-off-by: Jakub Kicinski --- include/linux/i2c.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 353d6b4e7a53..16119ac1aa97 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -52,19 +52,6 @@ typedef int (*i2c_slave_cb_t)(struct i2c_client *client, struct module; struct property_entry; -/* SMBus 3.0 extends the maximum block read/write size to 255 (from 32). - * The larger size is only supported by some drivers, indicated by - * the I2C_FUNC_SMBUS_V3_BLOCK functionality bit. - */ -#define I2C_SMBUS_V3_BLOCK_MAX 255 /* As specified in SMBus 3.0 standard */ - -/* Note compatibility definition in uapi header with 32 byte block */ -union i2c_smbus_data { - __u8 byte; - __u16 word; - __u8 block[I2C_SMBUS_V3_BLOCK_MAX + 1]; /* block[0] is used for length */ -}; - #if IS_ENABLED(CONFIG_I2C) /* Return the Frequency mode string based on the bus frequency */ const char *i2c_freq_mode_string(u32 bus_freq_hz); -- cgit v1.2.3 From ebf7f6f0a6cdcc17a3da52b81e4b3a98c4005028 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 5 Nov 2021 09:30:00 +0800 Subject: bpf: Change value of MAX_TAIL_CALL_CNT from 32 to 33 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the current code, the actual max tail call count is 33 which is greater than MAX_TAIL_CALL_CNT (defined as 32). The actual limit is not consistent with the meaning of MAX_TAIL_CALL_CNT and thus confusing at first glance. We can see the historical evolution from commit 04fd61ab36ec ("bpf: allow bpf programs to tail-call other bpf programs") and commit f9dabe016b63 ("bpf: Undo off-by-one in interpreter tail call count limit"). In order to avoid changing existing behavior, the actual limit is 33 now, this is reasonable. After commit 874be05f525e ("bpf, tests: Add tail call test suite"), we can see there exists failed testcase. On all archs when CONFIG_BPF_JIT_ALWAYS_ON is not set: # echo 0 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf # dmesg | grep -w FAIL Tail call error path, max count reached jited:0 ret 34 != 33 FAIL On some archs: # echo 1 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf # dmesg | grep -w FAIL Tail call error path, max count reached jited:1 ret 34 != 33 FAIL Although the above failed testcase has been fixed in commit 18935a72eb25 ("bpf/tests: Fix error in tail call limit tests"), it would still be good to change the value of MAX_TAIL_CALL_CNT from 32 to 33 to make the code more readable. The 32-bit x86 JIT was using a limit of 32, just fix the wrong comments and limit to 33 tail calls as the constant MAX_TAIL_CALL_CNT updated. For the mips64 JIT, use "ori" instead of "addiu" as suggested by Johan Almbladh. For the riscv JIT, use RV_REG_TCC directly to save one register move as suggested by Björn Töpel. For the other implementations, no function changes, it does not change the current limit 33, the new value of MAX_TAIL_CALL_CNT can reflect the actual max tail call count, the related tail call testcases in test_bpf module and selftests can work well for the interpreter and the JIT. Here are the test results on x86_64: # uname -m x86_64 # echo 0 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf test_suite=test_tail_calls # dmesg | tail -1 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [0/8 JIT'ed] # rmmod test_bpf # echo 1 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf test_suite=test_tail_calls # dmesg | tail -1 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [8/8 JIT'ed] # rmmod test_bpf # ./test_progs -t tailcalls #142 tailcalls:OK Summary: 1/11 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Tested-by: Johan Almbladh Tested-by: Ilya Leoshkevich Acked-by: Björn Töpel Acked-by: Johan Almbladh Acked-by: Ilya Leoshkevich Link: https://lore.kernel.org/bpf/1636075800-3264-1-git-send-email-yangtiezhu@loongson.cn --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 56098c866704..cc7a0c36e7df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1081,7 +1081,7 @@ struct bpf_array { }; #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ -#define MAX_TAIL_CALL_CNT 32 +#define MAX_TAIL_CALL_CNT 33 #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ -- cgit v1.2.3 From 42f67eea3ba36cef2dce2e853de6ddcb2e89eb39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:33 -0800 Subject: net: use sk_is_tcp() in more places Move sk_is_tcp() to include/net/sock.h and use it where we can. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skmsg.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 584d94be9c8b..18a717fe62eb 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -507,12 +507,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } -static inline bool sk_is_tcp(const struct sock *sk) -{ - return sk->sk_type == SOCK_STREAM && - sk->sk_protocol == IPPROTO_TCP; -} - static inline bool sk_is_udp(const struct sock *sk) { return sk->sk_type == SOCK_DGRAM && -- cgit v1.2.3 From f35f821935d8df76f9c92e2431a225bdff938169 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:46 -0800 Subject: tcp: defer skb freeing after socket lock is released tcp recvmsg() (or rx zerocopy) spends a fair amount of time freeing skbs after their payload has been consumed. A typical ~64KB GRO packet has to release ~45 page references, eventually going to page allocator for each of them. Currently, this freeing is performed while socket lock is held, meaning that there is a high chance that BH handler has to queue incoming packets to tcp socket backlog. This can cause additional latencies, because the user thread has to process the backlog at release_sock() time, and while doing so, additional frames can be added by BH handler. This patch adds logic to defer these frees after socket lock is released, or directly from BH handler if possible. Being able to free these skbs from BH handler helps a lot, because this avoids the usual alloc/free assymetry, when BH handler and user thread do not run on same cpu or NUMA node. One cpu can now be fully utilized for the kernel->user copy, and another cpu is handling BH processing and skb/page allocs/frees (assuming RFS is not forcing use of a single CPU) Tested: 100Gbit NIC Max throughput for one TCP_STREAM flow, over 10 runs MTU : 1500 Before: 55 Gbit After: 66 Gbit MTU : 4096+(headers) Before: 82 Gbit After: 95 Gbit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 686a666d073d..b8b806512e16 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -743,6 +744,7 @@ struct sk_buff { }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; + struct llist_node ll_node; }; union { -- cgit v1.2.3 From 4721031c3559db8eae61df305f10c00099a7c1d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:51 -0800 Subject: net: move gro definitions to include/net/gro.h include/linux/netdevice.h became too big, move gro stuff into include/net/gro.h Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 348 ---------------------------------------------- 1 file changed, 348 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec42495a43a..d95c9839ce90 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2520,109 +2520,6 @@ static inline void netif_napi_del(struct napi_struct *napi) synchronize_net(); } -struct napi_gro_cb { - /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ - void *frag0; - - /* Length of frag0. */ - unsigned int frag0_len; - - /* This indicates where we are processing relative to skb->data. */ - int data_offset; - - /* This is non-zero if the packet cannot be merged with the new skb. */ - u16 flush; - - /* Save the IP ID here and check when we get to the transport layer */ - u16 flush_id; - - /* Number of segments aggregated. */ - u16 count; - - /* Start offset for remote checksum offload */ - u16 gro_remcsum_start; - - /* jiffies when first packet was created/queued */ - unsigned long age; - - /* Used in ipv6_gro_receive() and foo-over-udp */ - u16 proto; - - /* This is non-zero if the packet may be of the same flow. */ - u8 same_flow:1; - - /* Used in tunnel GRO receive */ - u8 encap_mark:1; - - /* GRO checksum is valid */ - u8 csum_valid:1; - - /* Number of checksums via CHECKSUM_UNNECESSARY */ - u8 csum_cnt:3; - - /* Free the skb? */ - u8 free:2; -#define NAPI_GRO_FREE 1 -#define NAPI_GRO_FREE_STOLEN_HEAD 2 - - /* Used in foo-over-udp, set in udp[46]_gro_receive */ - u8 is_ipv6:1; - - /* Used in GRE, set in fou/gue_gro_receive */ - u8 is_fou:1; - - /* Used to determine if flush_id can be ignored */ - u8 is_atomic:1; - - /* Number of gro_receive callbacks this packet already went through */ - u8 recursion_counter:4; - - /* GRO is done by frag_list pointer chaining. */ - u8 is_flist:1; - - /* used to support CHECKSUM_COMPLETE for tunneling protocols */ - __wsum csum; - - /* used in skb_gro_receive() slow path */ - struct sk_buff *last; -}; - -#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) - -#define GRO_RECURSION_LIMIT 15 -static inline int gro_recursion_inc_test(struct sk_buff *skb) -{ - return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; -} - -typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); -static inline struct sk_buff *call_gro_receive(gro_receive_t cb, - struct list_head *head, - struct sk_buff *skb) -{ - if (unlikely(gro_recursion_inc_test(skb))) { - NAPI_GRO_CB(skb)->flush |= 1; - return NULL; - } - - return cb(head, skb); -} - -typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, - struct sk_buff *); -static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, - struct sock *sk, - struct list_head *head, - struct sk_buff *skb) -{ - if (unlikely(gro_recursion_inc_test(skb))) { - NAPI_GRO_CB(skb)->flush |= 1; - return NULL; - } - - return cb(sk, head, skb); -} - struct packet_type { __be16 type; /* This is really htons(ether_type). */ bool ignore_outgoing; @@ -3008,251 +2905,6 @@ int dev_restart(struct net_device *dev); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); -static inline unsigned int skb_gro_offset(const struct sk_buff *skb) -{ - return NAPI_GRO_CB(skb)->data_offset; -} - -static inline unsigned int skb_gro_len(const struct sk_buff *skb) -{ - return skb->len - NAPI_GRO_CB(skb)->data_offset; -} - -static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) -{ - NAPI_GRO_CB(skb)->data_offset += len; -} - -static inline void *skb_gro_header_fast(struct sk_buff *skb, - unsigned int offset) -{ - return NAPI_GRO_CB(skb)->frag0 + offset; -} - -static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen) -{ - return NAPI_GRO_CB(skb)->frag0_len < hlen; -} - -static inline void skb_gro_frag0_invalidate(struct sk_buff *skb) -{ - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; -} - -static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, - unsigned int offset) -{ - if (!pskb_may_pull(skb, hlen)) - return NULL; - - skb_gro_frag0_invalidate(skb); - return skb->data + offset; -} - -static inline void *skb_gro_network_header(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) + - skb_network_offset(skb); -} - -static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) -{ - if (NAPI_GRO_CB(skb)->csum_valid) - NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum, - csum_partial(start, len, 0)); -} - -/* GRO checksum functions. These are logical equivalents of the normal - * checksum functions (in skbuff.h) except that they operate on the GRO - * offsets and fields in sk_buff. - */ - -__sum16 __skb_gro_checksum_complete(struct sk_buff *skb); - -static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb)); -} - -static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, - bool zero_okay, - __sum16 check) -{ - return ((skb->ip_summed != CHECKSUM_PARTIAL || - skb_checksum_start_offset(skb) < - skb_gro_offset(skb)) && - !skb_at_gro_remcsum_start(skb) && - NAPI_GRO_CB(skb)->csum_cnt == 0 && - (!zero_okay || check)); -} - -static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, - __wsum psum) -{ - if (NAPI_GRO_CB(skb)->csum_valid && - !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) - return 0; - - NAPI_GRO_CB(skb)->csum = psum; - - return __skb_gro_checksum_complete(skb); -} - -static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) -{ - if (NAPI_GRO_CB(skb)->csum_cnt > 0) { - /* Consume a checksum from CHECKSUM_UNNECESSARY */ - NAPI_GRO_CB(skb)->csum_cnt--; - } else { - /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we - * verified a new top level checksum or an encapsulated one - * during GRO. This saves work if we fallback to normal path. - */ - __skb_incr_checksum_unnecessary(skb); - } -} - -#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ - compute_pseudo) \ -({ \ - __sum16 __ret = 0; \ - if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ - __ret = __skb_gro_checksum_validate_complete(skb, \ - compute_pseudo(skb, proto)); \ - if (!__ret) \ - skb_gro_incr_csum_unnecessary(skb); \ - __ret; \ -}) - -#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ - __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) - -#define skb_gro_checksum_validate_zero_check(skb, proto, check, \ - compute_pseudo) \ - __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) - -#define skb_gro_checksum_simple_validate(skb) \ - __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) - -static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->csum_cnt == 0 && - !NAPI_GRO_CB(skb)->csum_valid); -} - -static inline void __skb_gro_checksum_convert(struct sk_buff *skb, - __wsum pseudo) -{ - NAPI_GRO_CB(skb)->csum = ~pseudo; - NAPI_GRO_CB(skb)->csum_valid = 1; -} - -#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \ -do { \ - if (__skb_gro_checksum_convert_check(skb)) \ - __skb_gro_checksum_convert(skb, \ - compute_pseudo(skb, proto)); \ -} while (0) - -struct gro_remcsum { - int offset; - __wsum delta; -}; - -static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) -{ - grc->offset = 0; - grc->delta = 0; -} - -static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, - unsigned int off, size_t hdrlen, - int start, int offset, - struct gro_remcsum *grc, - bool nopartial) -{ - __wsum delta; - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); - - BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); - - if (!nopartial) { - NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; - return ptr; - } - - ptr = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, off + plen)) { - ptr = skb_gro_header_slow(skb, off + plen, off); - if (!ptr) - return NULL; - } - - delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, - start, offset); - - /* Adjust skb->csum since we changed the packet */ - NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); - - grc->offset = off + hdrlen + offset; - grc->delta = delta; - - return ptr; -} - -static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, - struct gro_remcsum *grc) -{ - void *ptr; - size_t plen = grc->offset + sizeof(u16); - - if (!grc->delta) - return; - - ptr = skb_gro_header_fast(skb, grc->offset); - if (skb_gro_header_hard(skb, grc->offset + sizeof(u16))) { - ptr = skb_gro_header_slow(skb, plen, grc->offset); - if (!ptr) - return; - } - - remcsum_unadjust((__sum16 *)ptr, grc->delta); -} - -#ifdef CONFIG_XFRM_OFFLOAD -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) -{ - if (PTR_ERR(pp) != -EINPROGRESS) - NAPI_GRO_CB(skb)->flush |= flush; -} -static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, - struct sk_buff *pp, - int flush, - struct gro_remcsum *grc) -{ - if (PTR_ERR(pp) != -EINPROGRESS) { - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, grc); - skb->remcsum_offload = 0; - } -} -#else -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) -{ - NAPI_GRO_CB(skb)->flush |= flush; -} -static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, - struct sk_buff *pp, - int flush, - struct gro_remcsum *grc) -{ - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, grc); - skb->remcsum_offload = 0; -} -#endif static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, -- cgit v1.2.3 From 0b935d7f8c07bf0a192712bdbf76dbf45ef8b115 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:52 -0800 Subject: net: gro: move skb_gro_receive_list to udp_offload.c This helper is used once, no need to keep it in fat net/core/skbuff.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d95c9839ce90..ce6ee1453dbc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2903,7 +2903,6 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); -int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From e456a18a390b96f22b0de2acd4d0f49c72ed2280 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:53 -0800 Subject: net: gro: move skb_gro_receive into net/core/gro.c net/core/gro.c will contain all core gro functions, to shrink net/core/skbuff.c and net/core/dev.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce6ee1453dbc..93d397db9ec4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2902,7 +2902,6 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); -int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From 587652bbdd06ab38a4c1b85e40f933d2cf4a1147 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:54 -0800 Subject: net: gro: populate net/core/gro.c Move gro code and data from net/core/dev.c to net/core/gro.c to ease maintenance. gro_normal_list() and gro_normal_one() are inlined because they are called from both files. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93d397db9ec4..31a7e6b27681 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3657,6 +3657,7 @@ int netif_rx_ni(struct sk_buff *skb); int netif_rx_any_context(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); +void netif_receive_skb_list_internal(struct list_head *head); void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); -- cgit v1.2.3 From c2c60ea37e5b6be58c9dd7aff0b2e86ba0f18e0b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:23:01 -0800 Subject: once: use __section(".data.once") .data.once contains nicely packed bool variables. It is used already by DO_ONCE_LITE(). Using it also in DO_ONCE() removes holes in .data section. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/once.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index d361fb14ac3a..f54523052bbc 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -38,7 +38,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key, #define DO_ONCE(func, ...) \ ({ \ bool ___ret = false; \ - static bool ___done = false; \ + static bool __section(".data.once") ___done = false; \ static DEFINE_STATIC_KEY_TRUE(___once_key); \ if (static_branch_unlikely(&___once_key)) { \ unsigned long ___flags; \ -- cgit v1.2.3 From 7071732c26fe2cf141185ed16a8a85d02495ae8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:23:02 -0800 Subject: net: use .data.once section in netdev_level_once() Same rationale than prior patch : using the dedicated section avoid holes and pack all these bool values. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 31a7e6b27681..dd328364dfe9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4942,7 +4942,7 @@ void netdev_info(const struct net_device *dev, const char *format, ...); #define netdev_level_once(level, dev, fmt, ...) \ do { \ - static bool __print_once __read_mostly; \ + static bool __section(".data.once") __print_once; \ \ if (!__print_once) { \ __print_once = true; \ -- cgit v1.2.3 From 49ecc2e9c3abd269951972fa8b23a4d081111b80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:23:03 -0800 Subject: net: align static siphash keys siphash keys use 16 bytes. Define siphash_aligned_key_t macro so that we can make sure they are not crossing a cache line boundary. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/siphash.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/siphash.h b/include/linux/siphash.h index bf21591a9e5e..3f7427b9e935 100644 --- a/include/linux/siphash.h +++ b/include/linux/siphash.h @@ -21,6 +21,8 @@ typedef struct { u64 key[2]; } siphash_key_t; +#define siphash_aligned_key_t siphash_key_t __aligned(16) + static inline bool siphash_key_is_zero(const siphash_key_t *key) { return !(key->key[0] | key->key[1]); -- cgit v1.2.3 From b9241f54138ca5af4d3c5ca6db56be83d7491508 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 15 Nov 2021 17:11:17 +0000 Subject: net: document SMII and correct phylink's new validation mechanism SMII has not been documented in the kernel, but information on this PHY interface mode has been recently found. Document it, and correct the recently introduced phylink handling for this interface mode. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/E1mmfVl-0075nP-14@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 96e43fbb2dd8..1e57cdd95da3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -99,7 +99,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RTBI: Reduced TBI - * @PHY_INTERFACE_MODE_SMII: ??? MII + * @PHY_INTERFACE_MODE_SMII: Serial MII * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface * @PHY_INTERFACE_MODE_XLGMII:40 gigabit media-independent interface * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax -- cgit v1.2.3 From 8160fb43d55d26d64607fd32fe69185a5f5fe41f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:21 -0800 Subject: net: use an atomic_long_t for queue->trans_timeout tx_timeout_show() assumed dev_watchdog() would stop all the queues, to fetch queue->trans_timeout under protection of the queue->_xmit_lock. As we want to no longer disrupt transmits, we use an atomic_long_t instead. Signed-off-by: Eric Dumazet Cc: david decotigny Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd328364dfe9..1d22483cf78c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -592,7 +592,7 @@ struct netdev_queue { * Number of TX timeouts for this queue * (/sys/class/net/DEV/Q/trans_timeout) */ - unsigned long trans_timeout; + atomic_long_t trans_timeout; /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; -- cgit v1.2.3 From 5337824f4dc4bb26f38fbbba4ffb425a92803f15 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:22 -0800 Subject: net: annotate accesses to queue->trans_start In following patches, dev_watchdog() will no longer stop all queues. It will read queue->trans_start locklessly. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1d22483cf78c..279409ef2b18 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4095,10 +4095,21 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) spin_unlock_bh(&txq->_xmit_lock); } +/* + * txq->trans_start can be read locklessly from dev_watchdog() + */ static inline void txq_trans_update(struct netdev_queue *txq) { if (txq->xmit_lock_owner != -1) - txq->trans_start = jiffies; + WRITE_ONCE(txq->trans_start, jiffies); +} + +static inline void txq_trans_cond_update(struct netdev_queue *txq) +{ + unsigned long now = jiffies; + + if (READ_ONCE(txq->trans_start) != now) + WRITE_ONCE(txq->trans_start, now); } /* legacy drivers only, netdev_start_xmit() sets txq->trans_start */ @@ -4106,8 +4117,7 @@ static inline void netif_trans_update(struct net_device *dev) { struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); - if (txq->trans_start != jiffies) - txq->trans_start = jiffies; + txq_trans_cond_update(txq); } /** -- cgit v1.2.3 From dab8fe320726b38a6b1dc6a7ca6e386c5f7779e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:23 -0800 Subject: net: do not inline netif_tx_lock()/netif_tx_unlock() These are not fast path, there is no point in inlining them. Also provide netif_freeze_queues()/netif_unfreeze_queues() so that we can use them from dev_watchdog() in the following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 39 ++------------------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 279409ef2b18..4f4a299e92de 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4126,27 +4126,7 @@ static inline void netif_trans_update(struct net_device *dev) * * Get network device transmit lock */ -static inline void netif_tx_lock(struct net_device *dev) -{ - unsigned int i; - int cpu; - - spin_lock(&dev->tx_global_lock); - cpu = smp_processor_id(); - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - - /* We are the only thread of execution doing a - * freeze, but we have to grab the _xmit_lock in - * order to synchronize with threads which are in - * the ->hard_start_xmit() handler and already - * checked the frozen bit. - */ - __netif_tx_lock(txq, cpu); - set_bit(__QUEUE_STATE_FROZEN, &txq->state); - __netif_tx_unlock(txq); - } -} +void netif_tx_lock(struct net_device *dev); static inline void netif_tx_lock_bh(struct net_device *dev) { @@ -4154,22 +4134,7 @@ static inline void netif_tx_lock_bh(struct net_device *dev) netif_tx_lock(dev); } -static inline void netif_tx_unlock(struct net_device *dev) -{ - unsigned int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - - /* No need to grab the _xmit_lock here. If the - * queue is not stopped for another reason, we - * force a schedule. - */ - clear_bit(__QUEUE_STATE_FROZEN, &txq->state); - netif_schedule_queue(txq); - } - spin_unlock(&dev->tx_global_lock); -} +void netif_tx_unlock(struct net_device *dev); static inline void netif_tx_unlock_bh(struct net_device *dev) { -- cgit v1.2.3 From df6160deb3debe6f964c16349f9431157ff67dda Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Nov 2021 17:57:29 -0800 Subject: tcp: add missing htmldocs for skb->ll_node and sk->defer_list Add missing entries to fix these "make htmldocs" warnings. ./include/linux/skbuff.h:953: warning: Function parameter or member 'll_node' not described in 'sk_buff' ./include/net/sock.h:540: warning: Function parameter or member 'defer_list' not described in 'sock' Fixes: f35f821935d8 ("tcp: defer skb freeing after socket lock is released") Signed-off-by: Eric Dumazet Reported-by: Stephen Rothwell Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b8b806512e16..100fd604fbc9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -627,6 +627,7 @@ typedef unsigned char *sk_buff_data_t; * for retransmit timer * @rbnode: RB tree node, alternative to next/prev for netem/tcp * @list: queue head + * @ll_node: anchor in an llist (eg socket defer_list) * @sk: Socket we are owned by * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in * fragmentation management -- cgit v1.2.3 From adeef3e32146a8d2a73c399dc6f5d76a449131b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Nov 2021 06:21:51 -0800 Subject: net: constify netdev->dev_addr Commit 406f42fa0d3c ("net-next: When a bond have a massive amount of VLANs...") introduced a rbtree for faster Ethernet address look up. We converted all users to make modifications via appropriate helpers, make netdev->dev_addr const. The update helpers need to upcast from the buffer to struct netdev_hw_addr. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4f4a299e92de..2462195784a9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2117,7 +2117,7 @@ struct net_device { * Cache lines mostly used on receive path (including eth_type_trans()) */ /* Interface address info used in eth_type_trans() */ - unsigned char *dev_addr; + const unsigned char *dev_addr; struct netdev_rx_queue *_rx; unsigned int num_rx_queues; @@ -4268,10 +4268,13 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ +void dev_addr_mod(struct net_device *dev, unsigned int offset, + const void *addr, size_t len); + static inline void __dev_addr_set(struct net_device *dev, const void *addr, size_t len) { - memcpy(dev->dev_addr, addr, len); + dev_addr_mod(dev, 0, addr, len); } static inline void dev_addr_set(struct net_device *dev, const u8 *addr) @@ -4279,13 +4282,6 @@ static inline void dev_addr_set(struct net_device *dev, const u8 *addr) __dev_addr_set(dev, addr, dev->addr_len); } -static inline void -dev_addr_mod(struct net_device *dev, unsigned int offset, - const void *addr, size_t len) -{ - memcpy(&dev->dev_addr[offset], addr, len); -} - int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, -- cgit v1.2.3 From d07b26f5bbea9ade34dfd6abea7b3ca056c03cd1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Nov 2021 06:21:53 -0800 Subject: dev_addr: add a modification check netdev->dev_addr should only be modified via helpers, but someone may be casting off the const. Add a runtime check to catch abuses. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2462195784a9..cb7f2661d187 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1942,6 +1942,8 @@ enum netdev_ml_priv_type { * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. * + * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2268,6 +2270,8 @@ struct net_device { /* protected by rtnl_lock */ struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; + + u8 dev_addr_shadow[MAX_ADDR_LEN]; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -4288,6 +4292,7 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); +void dev_addr_check(struct net_device *dev); /* Functions used for unicast addresses handling */ int dev_uc_add(struct net_device *dev, const unsigned char *addr); -- cgit v1.2.3 From 0b70c256eba8448b072d25c95ee65e59da8970de Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 18 Nov 2021 20:12:42 +0800 Subject: ethtool: add support to set/get rx buf len via ethtool Add support to set rx buf len via ethtool -G parameter and get rx buf len via ethtool -g parameter. Signed-off-by: Hao Chen Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- include/linux/ethtool.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 845a0ffc16ee..0b252b82988b 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -67,6 +67,22 @@ enum { ETH_RSS_HASH_FUNCS_COUNT }; +/** + * struct kernel_ethtool_ringparam - RX/TX ring configuration + * @rx_buf_len: Current length of buffers on the rx ring. + */ +struct kernel_ethtool_ringparam { + u32 rx_buf_len; +}; + +/** + * enum ethtool_supported_ring_param - indicator caps for setting ring params + * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len + */ +enum ethtool_supported_ring_param { + ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), +}; + #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) #define __ETH_RSS_HASH(name) __ETH_RSS_HASH_BIT(ETH_RSS_HASH_##name##_BIT) @@ -432,6 +448,7 @@ struct ethtool_module_power_mode_params { * @cap_link_lanes_supported: indicates if the driver supports lanes * parameter. * @supported_coalesce_params: supported types of interrupt coalescing. + * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. Should only set the * @driver, @version, @fw_version and @bus_info fields. If not * implemented, the @driver and @bus_info fields will be filled in @@ -613,6 +630,7 @@ struct ethtool_module_power_mode_params { struct ethtool_ops { u32 cap_link_lanes_supported:1; u32 supported_coalesce_params; + u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); int (*get_regs_len)(struct net_device *); void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); -- cgit v1.2.3 From 7462494408cd3de8b0bc1e79670bf213288501d0 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 18 Nov 2021 20:12:43 +0800 Subject: ethtool: extend ringparam setting/getting API with rx_buf_len Add two new parameters kernel_ringparam and extack for .get_ringparam and .set_ringparam to extend more ring params through netlink. Signed-off-by: Hao Chen Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- include/linux/ethtool.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 0b252b82988b..a26f37a27167 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -656,9 +656,13 @@ struct ethtool_ops { struct kernel_ethtool_coalesce *, struct netlink_ext_ack *); void (*get_ringparam)(struct net_device *, - struct ethtool_ringparam *); + struct ethtool_ringparam *, + struct kernel_ethtool_ringparam *, + struct netlink_ext_ack *); int (*set_ringparam)(struct net_device *, - struct ethtool_ringparam *); + struct ethtool_ringparam *, + struct kernel_ethtool_ringparam *, + struct netlink_ext_ack *); void (*get_pause_stats)(struct net_device *dev, struct ethtool_pause_stats *pause_stats); void (*get_pauseparam)(struct net_device *, -- cgit v1.2.3 From 4b66d2161b8125b6caa6971815e85631cf3cf36f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Nov 2021 07:43:31 -0800 Subject: net: annotate accesses to dev->gso_max_size dev->gso_max_size is written under RTNL protection, or when the device is not yet visible, but is read locklessly. Add the READ_ONCE()/WRITE_ONCE() pairs, and use netif_set_gso_max_size() where we can to better document what is going on. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cb7f2661d187..14eeb58ed197 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4731,7 +4731,8 @@ static inline bool netif_needs_gso(struct sk_buff *skb, static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { - dev->gso_max_size = size; + /* dev->gso_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_size, size); } static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, -- cgit v1.2.3 From 6d872df3e3b91532b142de9044e5b4984017a55f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Nov 2021 07:43:32 -0800 Subject: net: annotate accesses to dev->gso_max_segs dev->gso_max_segs is written under RTNL protection, or when the device is not yet visible, but is read locklessly. Add netif_set_gso_max_segs() helper. Add the READ_ONCE()/WRITE_ONCE() pairs, and use netif_set_gso_max_segs() where we can to better document what is going on. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 14eeb58ed197..df049864661d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4735,6 +4735,13 @@ static inline void netif_set_gso_max_size(struct net_device *dev, WRITE_ONCE(dev->gso_max_size, size); } +static inline void netif_set_gso_max_segs(struct net_device *dev, + unsigned int segs) +{ + /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_segs, segs); +} + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) -- cgit v1.2.3 From 291dcae39bc482f7edc91a50d97027327e0cf5f9 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 19 Nov 2021 10:58:09 -0500 Subject: net: phylink: Add helpers for c22 registers without MDIO Some devices expose memory-mapped c22-compliant PHYs. Because these devices do not have an MDIO bus, we cannot use the existing helpers. Refactor the existing helpers to allow supplying the values for c22 registers directly, instead of using MDIO to access them. Only get_state and set_advertisement are converted, since they contain the most complex logic. Because set_advertisement is never actually used outside phylink_mii_c22_pcs_config, move the MDIO-writing part into that function. Because some modes do not need the advertisement register set at all, we use -EINVAL for this purpose. Additionally, a new function phylink_pcs_enable_an is provided to determine whether to enable autonegotiation. Signed-off-by: Sean Anderson Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 3563820a1765..01224235df0f 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -527,11 +527,12 @@ void phylink_set_port_modes(unsigned long *bits); void phylink_set_10g_modes(unsigned long *mask); void phylink_helper_basex_speed(struct phylink_link_state *state); +void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, + u16 bmsr, u16 lpa); void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, struct phylink_link_state *state); -int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs, - phy_interface_t interface, - const unsigned long *advertising); +int phylink_mii_c22_pcs_encode_advertisement(phy_interface_t interface, + const unsigned long *advertising); int phylink_mii_c22_pcs_config(struct mdio_device *pcs, unsigned int mode, phy_interface_t interface, const unsigned long *advertising); -- cgit v1.2.3 From c4804670026b93f4ebddda30af89fd737bf93931 Mon Sep 17 00:00:00 2001 From: M Chetan Kumar Date: Sat, 20 Nov 2021 21:51:54 +0530 Subject: net: wwan: common debugfs base dir for wwan device This patch set brings in a common debugfs base directory i.e. /sys/kernel/debugfs/wwan/ in WWAN Subsystem for a WWAN device instance. So that it avoids driver polluting debugfs root with unrelated directories & possible name collusion. Having a common debugfs base directory for WWAN drivers eases user to match control devices with debugfs entries. WWAN Subsystem creates dentry (/sys/kernel/debugfs/wwan) on module load & removes dentry on module unload. When driver registers a new wwan device, dentry (wwanX) is created for WWAN device instance & on driver unregister dentry is removed. New API is introduced to return the wwan device instance dentry so that driver can create debugfs entries under it. Signed-off-by: M Chetan Kumar Reviewed-by: Loic Poulain Signed-off-by: David S. Miller --- include/linux/wwan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 9fac819f92e3..1646aa3e6779 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -171,4 +171,6 @@ int wwan_register_ops(struct device *parent, const struct wwan_ops *ops, void wwan_unregister_ops(struct device *parent); +struct dentry *wwan_get_debugfs_dir(struct device *parent); + #endif /* __WWAN_H */ -- cgit v1.2.3 From fba84957e2e2e201cf4e352efe0c7cac0fbb5d5d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 20 Nov 2021 16:31:48 -0800 Subject: skbuff: Move conditional preprocessor directives out of struct sk_buff In preparation for using the struct_group() macro in struct sk_buff, move the conditional preprocessor directives out of the region of struct sk_buff that will be enclosed by struct_group(). While GCC and Clang are happy with conditional preprocessor directives here, sparse is not, even under -Wno-directive-within-macro[1], as would be seen under a C=1 build: net/core/filter.c: note: in included file (through include/linux/netlink.h, include/linux/sock_diag.h): ./include/linux/skbuff.h:820:1: warning: directive in macro's argument list ./include/linux/skbuff.h:822:1: warning: directive in macro's argument list ./include/linux/skbuff.h:846:1: warning: directive in macro's argument list ./include/linux/skbuff.h:848:1: warning: directive in macro's argument list Additionally remove empty macro argument definitions and usage. "objdump -d" shows no object code differences. [1] https://www.spinics.net/lists/linux-sparse/msg10857.html Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/linux/skbuff.h | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 059b6266dcd7..c7889afe0d0d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -795,7 +795,7 @@ struct sk_buff { #else #define CLONED_MASK 1 #endif -#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) +#define CLONED_OFFSET offsetof(struct sk_buff, __cloned_offset) /* private: */ __u8 __cloned_offset[0]; @@ -818,18 +818,10 @@ struct sk_buff { __u32 headers_start[0]; /* public: */ -/* if you move pkt_type around you also must adapt those constants */ -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_TYPE_MAX (7 << 5) -#else -#define PKT_TYPE_MAX 7 -#endif -#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) - /* private: */ __u8 __pkt_type_offset[0]; /* public: */ - __u8 pkt_type:3; + __u8 pkt_type:3; /* see PKT_TYPE_MAX */ __u8 ignore_df:1; __u8 nf_trace:1; __u8 ip_summed:2; @@ -845,16 +837,10 @@ struct sk_buff { __u8 encap_hdr_csum:1; __u8 csum_valid:1; -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_VLAN_PRESENT_BIT 7 -#else -#define PKT_VLAN_PRESENT_BIT 0 -#endif -#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset) /* private: */ __u8 __pkt_vlan_present_offset[0]; /* public: */ - __u8 vlan_present:1; + __u8 vlan_present:1; /* See PKT_VLAN_PRESENT_BIT */ __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 csum_not_inet:1; @@ -953,6 +939,22 @@ struct sk_buff { #endif }; +/* if you move pkt_type around you also must adapt those constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_TYPE_MAX (7 << 5) +#else +#define PKT_TYPE_MAX 7 +#endif +#define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) + +/* if you move pkt_vlan_present around you also must adapt these constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_VLAN_PRESENT_BIT 7 +#else +#define PKT_VLAN_PRESENT_BIT 0 +#endif +#define PKT_VLAN_PRESENT_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) + #ifdef __KERNEL__ /* * Handling routines are only of interest to the kernel -- cgit v1.2.3 From 03f61041c17914355dde7261be9ccdc821ddd454 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 20 Nov 2021 16:31:49 -0800 Subject: skbuff: Switch structure bounds to struct_group() In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Replace the existing empty member position markers "headers_start" and "headers_end" with a struct_group(). This will allow memcpy() and sizeof() to more easily reason about sizes, and improve readability. "pahole" shows no size nor member offset changes to struct sk_buff. "objdump -d" shows no object code changes (outside of WARNs affected by source line number changes). Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Reviewed-by: Jason A. Donenfeld # drivers/net/wireguard/* Link: https://lore.kernel.org/lkml/20210728035006.GD35706@embeddedor Signed-off-by: David S. Miller --- include/linux/skbuff.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c7889afe0d0d..eba256af64a5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -811,12 +811,10 @@ struct sk_buff { __u8 active_extensions; #endif - /* fields enclosed in headers_start/headers_end are copied + /* Fields enclosed in headers group are copied * using a single memcpy() in __copy_skb_header() */ - /* private: */ - __u32 headers_start[0]; - /* public: */ + struct_group(headers, /* private: */ __u8 __pkt_type_offset[0]; @@ -921,9 +919,7 @@ struct sk_buff { u64 kcov_handle; #endif - /* private: */ - __u32 headers_end[0]; - /* public: */ + ); /* end headers group */ /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; -- cgit v1.2.3 From e523af4ee56090fbdd9cf474752448d35930bcd4 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Mon, 18 Oct 2021 18:16:02 -0500 Subject: net/ice: Add support for enable_iwarp and enable_roce devlink param Allow support for 'enable_iwarp' and 'enable_roce' devlink params to turn on/off iWARP or RoCE protocol support for E800 devices. For example, a user can turn on iWARP functionality with, devlink dev param set pci/0000:07:00.0 name enable_iwarp value true cmode runtime This add an iWARP auxiliary rdma device, ice.iwarp.<>, under this PF. A user request to enable both iWARP and RoCE under the same PF is rejected since this device does not support both protocols simultaneously on the same port. Signed-off-by: Shiraz Saleem Tested-by: Leszek Kaliszczuk Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h index e32f6712aee0..1289593411d3 100644 --- a/include/linux/net/intel/iidc.h +++ b/include/linux/net/intel/iidc.h @@ -26,6 +26,11 @@ enum iidc_reset_type { IIDC_GLOBR, }; +enum iidc_rdma_protocol { + IIDC_RDMA_PROTOCOL_IWARP = BIT(0), + IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), +}; + #define IIDC_MAX_USER_PRIORITY 8 /* Struct to hold per RDMA Qset info */ @@ -70,8 +75,6 @@ int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); -#define IIDC_RDMA_ROCE_NAME "roce" - /* Structure representing auxiliary driver tailored information about the core * PCI dev, each auxiliary driver using the IIDC interface will have an * instance of this struct dedicated to it. -- cgit v1.2.3 From 2106efda785b55a8957efed9a52dfa28ee0d7280 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 22 Nov 2021 17:24:47 -0800 Subject: net: remove .ndo_change_proto_down .ndo_change_proto_down was added seemingly to enable out-of-tree implementations. Over 2.5yrs later we still have no real users upstream. Hardwire the generic implementation for now, we can revert once real users materialize. (rocker is a test vehicle, not a user.) We need to drop the optimization on the sysfs side, because unlike ndos priv_flags will be changed at runtime, so we'd need READ_ONCE/WRITE_ONCE everywhere.. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df049864661d..db3bff1ae7fd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1297,11 +1297,6 @@ struct netdev_net_notifier { * TX queue. * int (*ndo_get_iflink)(const struct net_device *dev); * Called to get the iflink value of this device. - * void (*ndo_change_proto_down)(struct net_device *dev, - * bool proto_down); - * This function is used to pass protocol port error state information - * to the switch driver. The switch driver can react to the proto_down - * by doing a phys down on the associated switch port. * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); * This function is used to get egress tunnel information for given skb. * This is useful for retrieving outer tunnel header parameters while @@ -1542,8 +1537,6 @@ struct net_device_ops { int queue_index, u32 maxrate); int (*ndo_get_iflink)(const struct net_device *dev); - int (*ndo_change_proto_down)(struct net_device *dev, - bool proto_down); int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, @@ -1612,6 +1605,7 @@ struct net_device_ops { * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) + * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1646,6 +1640,7 @@ enum netdev_priv_flags { IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, IFF_TX_SKB_NO_LINEAR = 1<<31, + IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1982,7 +1977,7 @@ struct net_device { /* Read-mostly cache-line for fast-path access */ unsigned int flags; - unsigned int priv_flags; + unsigned long long priv_flags; const struct net_device_ops *netdev_ops; int ifindex; unsigned short gflags; @@ -3735,7 +3730,6 @@ int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); int dev_change_proto_down(struct net_device *dev, bool proto_down); -int dev_change_proto_down_generic(struct net_device *dev, bool proto_down); void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); -- cgit v1.2.3 From c6d5f1933085f9a92ed5c256a859ab31c7a35f88 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Mon, 22 Nov 2021 12:19:31 +0100 Subject: net: stmmac: Calculate CDC error only once The clock domain crossing error (CDC) is calculated at every fetch of Tx or Rx timestamps. It includes a division. Especially on arm32 based systems it is expensive. It also requires two conditionals in the hotpath. Add a compensation value cache to struct plat_stmmacenet_data and subtract it unconditionally in the RX/TX functions which spares the conditionals. The value is initialized to 0 and if supported calculated in the PTP initialization code. Suggested-by: Thomas Gleixner Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20211122111931.135135-1-kurt@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index a6f03b36fc4f..89b8e208cd7b 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -241,6 +241,7 @@ struct plat_stmmacenet_data { unsigned int clk_ref_rate; unsigned int mult_fact_100ns; s32 ptp_max_adj; + u32 cdc_error_adj; struct reset_control *stmmac_rst; struct reset_control *stmmac_ahb_rst; struct stmmac_axi *axi; -- cgit v1.2.3 From 29c3002644bdd653f6ec6407d25135d0a4f7cefb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Nov 2021 12:24:46 -0800 Subject: net: optimize skb_postpull_rcsum() Remove one pair of add/adc instructions and their dependency against carry flag. We can leverage third argument to csum_partial(): X = csum_block_sub(X, csum_partial(start, len, 0), 0); --> X = csum_block_add(X, ~csum_partial(start, len, 0), 0); --> X = ~csum_partial(start, len, ~X); Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index eba256af64a5..eae4bd3237a4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3485,7 +3485,11 @@ __skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len, static inline void skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { - __skb_postpull_rcsum(skb, start, len, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = ~csum_partial(start, len, ~skb->csum); + else if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_start_offset(skb) < 0) + skb->ip_summed = CHECKSUM_NONE; } static __always_inline void -- cgit v1.2.3 From 2cca3465147d650be3de04927a99784b30251ade Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Fri, 12 Nov 2021 08:28:09 +0200 Subject: mei: bus: add client dma interface Expose the client dma mapping via mei client bus interface. The client dma has to be mapped before the device is enabled, therefore we need to create device linking already during mapping and we need to unmap after the client is disable hence we need to postpone the unlink and flush till unmapping or when destroying the device. Signed-off-by: Alexander Usyskin Co-developed-by: Tomas Winkler Signed-off-by: Tomas Winkler Signed-off-by: Emmanuel Grumbach Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20210420172755.12178-1-emmanuel.grumbach@intel.com Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211112062814.7502-1-emmanuel.grumbach@intel.com --- include/linux/mei_cl_bus.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index c6786c12b207..df1fab44ea5c 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -117,4 +117,7 @@ int mei_cldev_enable(struct mei_cl_device *cldev); int mei_cldev_disable(struct mei_cl_device *cldev); bool mei_cldev_enabled(const struct mei_cl_device *cldev); +void *mei_cldev_dma_map(struct mei_cl_device *cldev, u8 buffer_id, size_t size); +int mei_cldev_dma_unmap(struct mei_cl_device *cldev); + #endif /* _LINUX_MEI_CL_BUS_H */ -- cgit v1.2.3 From 75c5bd68b699bbcb6d25879644d62de4da14ab92 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Fri, 26 Nov 2021 10:48:19 +0100 Subject: ieee80211: change HE nominal packet padding value defines It's easier to use and understand, and to extend for EHT later, if we use the values here instead of the shifted values. Unfortunately, we need to add _POS so that we can use it in places like iwlwifi/mvm where constants are needed. While at it, fix the typo ("NOMIMAL") which also helps catch any conflicts. Signed-off-by: Miri Korenblit Link: https://lore.kernel.org/r/20211126104817.7c29a05b8eb5.I2ca9faf06e177e3035bec91e2ae53c2f91d41774@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 11d7af260f20..559b6c644938 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2258,11 +2258,12 @@ enum ieee80211_client_reg_power { #define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU 0x08 #define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB 0x10 #define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB 0x20 -#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_0US 0x00 -#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_8US 0x40 -#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_16US 0x80 -#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_RESERVED 0xc0 -#define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_MASK 0xc0 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US 0x0 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US 0x1 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US 0x2 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED 0x3 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_POS 6 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK 0xc0 #define IEEE80211_HE_PHY_CAP10_HE_MU_M1RU_MAX_LTF 0x01 -- cgit v1.2.3 From b99658452355d316debee11079e8f1c6c1029355 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Sun, 28 Nov 2021 17:57:37 -0800 Subject: net: dsa: ocelot: felix: utilize shared mscc-miim driver for indirect MDIO access Switch to a shared MDIO access implementation by way of the mdio-mscc-miim driver. Signed-off-by: Colin Foster Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/mdio/mdio-mscc-miim.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/mdio/mdio-mscc-miim.h (limited to 'include/linux') diff --git a/include/linux/mdio/mdio-mscc-miim.h b/include/linux/mdio/mdio-mscc-miim.h new file mode 100644 index 000000000000..5b4ed2c3cbb9 --- /dev/null +++ b/include/linux/mdio/mdio-mscc-miim.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* + * Driver for the MDIO interface of Microsemi network switches. + * + * Author: Colin Foster + * Copyright (C) 2021 Innovative Advantage + */ +#ifndef MDIO_MSCC_MIIM_H +#define MDIO_MSCC_MIIM_H + +#include +#include +#include + +int mscc_miim_setup(struct device *device, struct mii_bus **bus, + const char *name, struct regmap *mii_regmap, + int status_offset); + +#endif -- cgit v1.2.3 From 4047b9db1aa7512a10ba3560a3f63821c8c40235 Mon Sep 17 00:00:00 2001 From: Bhupesh Sharma Date: Mon, 29 Nov 2021 01:28:54 +0530 Subject: net: stmmac: Add platform level debug register dump feature dwmac-qcom-ethqos currently exposes a mechanism to dump rgmii registers after the 'stmmac_dvr_probe()' returns. However with commit 5ec55823438e ("net: stmmac: add clocks management for gmac driver"), we now let 'pm_runtime_put()' disable the clocks before returning from 'stmmac_dvr_probe()'. This causes a crash when 'rgmii_dump()' register dumps are enabled, as the clocks are already off. Since other dwmac drivers (possible future users as well) might require a similar register dump feature, introduce a platform level callback to allow the same. This fixes the crash noticed while enabling rgmii_dump() dumps in dwmac-qcom-ethqos driver as well. It also allows future changes to keep a invoking the register dump callback from the correct place inside 'stmmac_dvr_probe()'. Fixes: 5ec55823438e ("net: stmmac: add clocks management for gmac driver") Cc: Joakim Zhang Cc: David S. Miller Signed-off-by: Bhupesh Sharma Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 89b8e208cd7b..24eea1b05ca2 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -233,6 +233,7 @@ struct plat_stmmacenet_data { int (*clks_config)(void *priv, bool enabled); int (*crosststamp)(ktime_t *device, struct system_counterval_t *system, void *ctx); + void (*dump_debug_regs)(void *priv); void *bsp_priv; struct clk *stmmac_clk; struct clk *pclk; -- cgit v1.2.3 From e6f2dd0f80674e9d5960337b3e9c2a242441b326 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 29 Nov 2021 19:06:19 -0800 Subject: bpf: Add bpf_loop helper This patch adds the kernel-side and API changes for a new helper function, bpf_loop: long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags); where long (*callback_fn)(u32 index, void *ctx); bpf_loop invokes the "callback_fn" **nr_loops** times or until the callback_fn returns 1. The callback_fn can only return 0 or 1, and this is enforced by the verifier. The callback_fn index is zero-indexed. A few things to please note: ~ The "u64 flags" parameter is currently unused but is included in case a future use case for it arises. ~ In the kernel-side implementation of bpf_loop (kernel/bpf/bpf_iter.c), bpf_callback_t is used as the callback function cast. ~ A program can have nested bpf_loop calls but the program must still adhere to the verifier constraint of its stack depth (the stack depth cannot exceed MAX_BPF_STACK)) ~ Recursive callback_fns do not pass the verifier, due to the call stack for these being too deep. ~ The next patch will include the tests and benchmark Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211130030622.4131246-2-joannekoong@fb.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc7a0c36e7df..cad0829710be 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2164,6 +2164,7 @@ extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; extern const struct bpf_func_proto bpf_find_vma_proto; +extern const struct bpf_func_proto bpf_loop_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From 8293eb995f349aed28006792cad4cb48091919dd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:25 -0800 Subject: bpf: Rename btf_member accessors. Rename btf_member_bit_offset() and btf_member_bitfield_size() to avoid conflicts with similarly named helpers in libbpf's btf.h. Rename the kernel helpers, since libbpf helpers are part of uapi. Suggested-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-3-alexei.starovoitov@gmail.com --- include/linux/btf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 203eef993d76..956f70388f69 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -194,15 +194,15 @@ static inline bool btf_type_kflag(const struct btf_type *t) return BTF_INFO_KFLAG(t->info); } -static inline u32 btf_member_bit_offset(const struct btf_type *struct_type, - const struct btf_member *member) +static inline u32 __btf_member_bit_offset(const struct btf_type *struct_type, + const struct btf_member *member) { return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) : member->offset; } -static inline u32 btf_member_bitfield_size(const struct btf_type *struct_type, - const struct btf_member *member) +static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type, + const struct btf_member *member) { return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) : 0; -- cgit v1.2.3 From 29db4bea1d10b73749d7992c1fc9ac13499e8871 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:26 -0800 Subject: bpf: Prepare relo_core.c for kernel duty. Make relo_core.c to be compiled for the kernel and for user space libbpf. Note the patch is reducing BPF_CORE_SPEC_MAX_LEN from 64 to 32. This is the maximum number of nested structs and arrays. For example: struct sample { int a; struct { int b[10]; }; }; struct sample *s = ...; int *y = &s->b[5]; This field access is encoded as "0:1:0:5" and spec len is 4. The follow up patch might bump it back to 64. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-4-alexei.starovoitov@gmail.com --- include/linux/btf.h | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 956f70388f69..acef6ef28768 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -144,6 +144,53 @@ static inline bool btf_type_is_enum(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; } +static inline bool str_is_empty(const char *s) +{ + return !s || !s[0]; +} + +static inline u16 btf_kind(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info); +} + +static inline bool btf_is_enum(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM; +} + +static inline bool btf_is_composite(const struct btf_type *t) +{ + u16 kind = btf_kind(t); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static inline bool btf_is_array(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ARRAY; +} + +static inline bool btf_is_int(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_INT; +} + +static inline bool btf_is_ptr(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_PTR; +} + +static inline u8 btf_int_offset(const struct btf_type *t) +{ + return BTF_INT_OFFSET(*(u32 *)(t + 1)); +} + +static inline u8 btf_int_encoding(const struct btf_type *t) +{ + return BTF_INT_ENCODING(*(u32 *)(t + 1)); +} + static inline bool btf_type_is_scalar(const struct btf_type *t) { return btf_type_is_int(t) || btf_type_is_enum(t); @@ -184,6 +231,11 @@ static inline u16 btf_type_vlen(const struct btf_type *t) return BTF_INFO_VLEN(t->info); } +static inline u16 btf_vlen(const struct btf_type *t) +{ + return btf_type_vlen(t); +} + static inline u16 btf_func_linkage(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); @@ -208,11 +260,40 @@ static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type, : 0; } +static inline struct btf_member *btf_members(const struct btf_type *t) +{ + return (struct btf_member *)(t + 1); +} + +static inline u32 btf_member_bit_offset(const struct btf_type *t, u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + + return __btf_member_bit_offset(t, m); +} + +static inline u32 btf_member_bitfield_size(const struct btf_type *t, u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + + return __btf_member_bitfield_size(t, m); +} + static inline const struct btf_member *btf_type_member(const struct btf_type *t) { return (const struct btf_member *)(t + 1); } +static inline struct btf_array *btf_array(const struct btf_type *t) +{ + return (struct btf_array *)(t + 1); +} + +static inline struct btf_enum *btf_enum(const struct btf_type *t) +{ + return (struct btf_enum *)(t + 1); +} + static inline const struct btf_var_secinfo *btf_type_var_secinfo( const struct btf_type *t) { -- cgit v1.2.3 From fbd94c7afcf99c9f3b1ba1168657ecc428eb2c8d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:28 -0800 Subject: bpf: Pass a set of bpf_core_relo-s to prog_load command. struct bpf_core_relo is generated by llvm and processed by libbpf. It's a de-facto uapi. With CO-RE in the kernel the struct bpf_core_relo becomes uapi de-jure. Add an ability to pass a set of 'struct bpf_core_relo' to prog_load command and let the kernel perform CO-RE relocations. Note the struct bpf_line_info and struct bpf_func_info have the same layout when passed from LLVM to libbpf and from libbpf to the kernel except "insn_off" fields means "byte offset" when LLVM generates it. Then libbpf converts it to "insn index" to pass to the kernel. The struct bpf_core_relo's "insn_off" field is always "byte offset". Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-6-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cad0829710be..8bbf08fbab66 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1732,6 +1732,14 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); +struct bpf_core_ctx { + struct bpf_verifier_log *log; + const struct btf *btf; +}; + +int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, + int relo_idx, void *insn); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { -- cgit v1.2.3 From b247f32aecad09e6cf7edff7739e6f2c9dc5fca9 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Thu, 28 Oct 2021 16:03:06 +0300 Subject: net/mlx5: Dynamically resize flow counters query buffer The flow counters bulk query buffer is allocated once during mlx5_fc_init_stats(). For PFs and VFs this buffer usually takes a little more than 512KB of memory, which is aligned to the next power of 2, to 1MB. For SFs, this buffer is reduced and takes around 128 Bytes. The buffer size determines the maximum number of flow counters that can be queried at a time. Thus, having a bigger buffer can improve performance for users that need to query many flow counters. There are cases that don't use many flow counters and don't need a big buffer (e.g. SFs, VFs). Since this size is critical with large scale, in these cases the buffer size should be reduced. In order to reduce memory consumption while maintaining query performance, change the query buffer's allocation scheme to the following: - First allocate the buffer with small initial size. - If the number of counters surpasses the initial size, resize the buffer to the maximum size. The buffer only grows and isn't shrank, because users with many flow counters don't care about the buffer size and we don't want to add resize overhead if the current number of counters drops. This solution is preferable to the current one, which is less accurate and only addresses SFs. Signed-off-by: Avihai Horon Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a623ec635947..78655d8d13a7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -478,6 +478,10 @@ struct mlx5_fc_stats { unsigned long next_query; unsigned long sampling_interval; /* jiffies */ u32 *bulk_query_out; + int bulk_query_len; + size_t num_counters; + bool bulk_query_alloc_failed; + unsigned long next_bulk_query_alloc; struct mlx5_fc_pool fc_pool; }; -- cgit v1.2.3 From 0cc3a8017900f856f9bf4fdc41c2b5cb1670aabe Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Thu, 2 Dec 2021 13:01:56 -0800 Subject: qed*: enhance tx timeout debug info This patch add some new qed APIs to query status block info and report various data to MFW on tx timeout event Along with that it enhances qede to dump more debug logs (not just specific to the queue which was reported by stack) on tx timeout which includes various other basic metadata about all tx queues and other info (like status block etc.) Signed-off-by: Manish Chopra Signed-off-by: Prabhakar Kushwaha Signed-off-by: Alok Prasad Signed-off-by: Ariel Elior Signed-off-by: Jakub Kicinski --- include/linux/qed/qed_if.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 0dae7fcc5ef2..9f4bfa2a4829 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -807,6 +807,12 @@ struct qed_devlink { struct devlink_health_reporter *fw_reporter; }; +struct qed_sb_info_dbg { + u32 igu_prod; + u32 igu_cons; + u16 pi[PIS_PER_SB]; +}; + struct qed_common_cb_ops { void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc); void (*link_update)(void *dev, struct qed_link_output *link); @@ -1194,6 +1200,11 @@ struct qed_common_ops { struct devlink* (*devlink_register)(struct qed_dev *cdev); void (*devlink_unregister)(struct devlink *devlink); + + __printf(2, 3) void (*mfw_report)(struct qed_dev *cdev, char *fmt, ...); + + int (*get_sb_info)(struct qed_dev *cdev, struct qed_sb_info *sb, + u16 qid, struct qed_sb_info_dbg *sb_dbg); }; #define MASK_FIELD(_name, _value) \ -- cgit v1.2.3 From 823163ba6e52e644be5df4539a19e3df8d0988dd Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Thu, 2 Dec 2021 13:01:57 -0800 Subject: qed*: esl priv flag support through ethtool ESL(Enhanced System Lockdown) was designed to lock PCI adapter firmware images and prevent changes to critical non-volatile configuration data so that uncontrolled, malicious or unintentional modification to the adapters are avoided, ensuring it's operational state. Once this feature is enabled, the device is locked, rejecting any modification to non-volatile images. Once unlocked, the protection is off such that firmware and non-volatile configurations may be altered. Driver just reflects the capability and status of this through the ethtool private flag. Signed-off-by: Manish Chopra Signed-off-by: Prabhakar Kushwaha Signed-off-by: Alok Prasad Signed-off-by: Ariel Elior Signed-off-by: Jakub Kicinski --- include/linux/qed/qed_if.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 9f4bfa2a4829..6dc4943d8aec 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -652,6 +652,7 @@ struct qed_dev_info { bool wol_support; bool smart_an; + bool esl; /* MBI version */ u32 mbi_version; @@ -1205,6 +1206,8 @@ struct qed_common_ops { int (*get_sb_info)(struct qed_dev *cdev, struct qed_sb_info *sb, u16 qid, struct qed_sb_info_dbg *sb_dbg); + + int (*get_esl_status)(struct qed_dev *cdev, bool *esl_active); }; #define MASK_FIELD(_name, _value) \ -- cgit v1.2.3 From 866de407444398bc8140ea70de1dba5f91cc34ac Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 3 Dec 2021 13:30:01 +0800 Subject: bpf: Disallow BPF_LOG_KERNEL log level for bpf(BPF_BTF_LOAD) BPF_LOG_KERNEL is only used internally, so disallow bpf_btf_load() to set log level as BPF_LOG_KERNEL. The same checking has already been done in bpf_check(), so factor out a helper to check the validity of log attributes and use it in both places. Fixes: 8580ac9404f6 ("bpf: Process in-kernel BTF") Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211203053001.740945-1-houtao1@huawei.com --- include/linux/bpf_verifier.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c8a78e830fca..182b16a91084 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -396,6 +396,13 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) log->level == BPF_LOG_KERNEL); } +static inline bool +bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +{ + return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 && + log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); +} + #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_info { -- cgit v1.2.3 From 4e66934eaadc83b27ada8d42b60894018f3bfabf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:55 -0800 Subject: lib: add reference counting tracking infrastructure It can be hard to track where references are taken and released. In networking, we have annoying issues at device or netns dismantles, and we had various proposals to ease root causing them. This patch adds new infrastructure pairing refcount increases and decreases. This will self document code, because programmers will have to associate increments/decrements. This is controled by CONFIG_REF_TRACKER which can be selected by users of this feature. This adds both cpu and memory costs, and thus should probably be used with care. Signed-off-by: Eric Dumazet Reviewed-by: Dmitry Vyukov Signed-off-by: Jakub Kicinski --- include/linux/ref_tracker.h | 73 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 include/linux/ref_tracker.h (limited to 'include/linux') diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h new file mode 100644 index 000000000000..c11c9db5825c --- /dev/null +++ b/include/linux/ref_tracker.h @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#ifndef _LINUX_REF_TRACKER_H +#define _LINUX_REF_TRACKER_H +#include +#include +#include + +struct ref_tracker; + +struct ref_tracker_dir { +#ifdef CONFIG_REF_TRACKER + spinlock_t lock; + unsigned int quarantine_avail; + refcount_t untracked; + struct list_head list; /* List of active trackers */ + struct list_head quarantine; /* List of dead trackers */ +#endif +}; + +#ifdef CONFIG_REF_TRACKER +static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, + unsigned int quarantine_count) +{ + INIT_LIST_HEAD(&dir->list); + INIT_LIST_HEAD(&dir->quarantine); + spin_lock_init(&dir->lock); + dir->quarantine_avail = quarantine_count; + refcount_set(&dir->untracked, 1); +} + +void ref_tracker_dir_exit(struct ref_tracker_dir *dir); + +void ref_tracker_dir_print(struct ref_tracker_dir *dir, + unsigned int display_limit); + +int ref_tracker_alloc(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp, gfp_t gfp); + +int ref_tracker_free(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp); + +#else /* CONFIG_REF_TRACKER */ + +static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, + unsigned int quarantine_count) +{ +} + +static inline void ref_tracker_dir_exit(struct ref_tracker_dir *dir) +{ +} + +static inline void ref_tracker_dir_print(struct ref_tracker_dir *dir, + unsigned int display_limit) +{ +} + +static inline int ref_tracker_alloc(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp, + gfp_t gfp) +{ + return 0; +} + +static inline int ref_tracker_free(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp) +{ + return 0; +} + +#endif + +#endif /* _LINUX_REF_TRACKER_H */ -- cgit v1.2.3 From 4d92b95ff2f95f13df9bad0b5a25a9f60e72758d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:57 -0800 Subject: net: add net device refcount tracker infrastructure net device are refcounted. Over the years we had numerous bugs caused by imbalanced dev_hold() and dev_put() calls. The general idea is to be able to precisely pair each decrement with a corresponding prior increment. Both share a cookie, basically a pointer to private data storing stack traces. This patch adds dev_hold_track() and dev_put_track(). To use these helpers, each data structure owning a refcount should also use a "netdevice_tracker" to pair the hold and put. netdevice_tracker dev_tracker; ... dev_hold_track(dev, &dev_tracker, GFP_ATOMIC); ... dev_put_track(dev, &dev_tracker); Whenever a leak happens, we will get precise stack traces of the point dev_hold_track() happened, at device dismantle phase. We will also get a stack trace if too many dev_put_track() for the same netdevice_tracker are attempted. This is guarded by CONFIG_NET_DEV_REFCNT_TRACKER option. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 65117f01d5f2..143d60ed0047 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -48,6 +48,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -300,6 +301,12 @@ enum netdev_state_t { }; +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER +typedef struct ref_tracker *netdevice_tracker; +#else +typedef struct {} netdevice_tracker; +#endif + struct gro_list { struct list_head list; int count; @@ -1865,6 +1872,7 @@ enum netdev_ml_priv_type { * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device * @dev_refcnt: Number of references to this device + * @refcnt_tracker: Tracker directory for tracked references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one * @@ -2178,6 +2186,7 @@ struct net_device { #else refcount_t dev_refcnt; #endif + struct ref_tracker_dir refcnt_tracker; struct list_head link_watch_list; @@ -3805,6 +3814,7 @@ void netdev_run_todo(void); * @dev: network device * * Release reference to device to allow it to be freed. + * Try using dev_put_track() instead. */ static inline void dev_put(struct net_device *dev) { @@ -3822,6 +3832,7 @@ static inline void dev_put(struct net_device *dev) * @dev: network device * * Hold reference to device to keep it from being freed. + * Try using dev_hold_track() instead. */ static inline void dev_hold(struct net_device *dev) { @@ -3834,6 +3845,40 @@ static inline void dev_hold(struct net_device *dev) } } +static inline void netdev_tracker_alloc(struct net_device *dev, + netdevice_tracker *tracker, gfp_t gfp) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp); +#endif +} + +static inline void netdev_tracker_free(struct net_device *dev, + netdevice_tracker *tracker) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + ref_tracker_free(&dev->refcnt_tracker, tracker); +#endif +} + +static inline void dev_hold_track(struct net_device *dev, + netdevice_tracker *tracker, gfp_t gfp) +{ + if (dev) { + dev_hold(dev); + netdev_tracker_alloc(dev, tracker, gfp); + } +} + +static inline void dev_put_track(struct net_device *dev, + netdevice_tracker *tracker) +{ + if (dev) { + netdev_tracker_free(dev, tracker); + dev_put(dev); + } +} + /* Carrier loss detection, dial on demand. The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller * who is responsible for serialization of these calls. -- cgit v1.2.3 From 80e8921b2b72c300ca56a01729004d30bedb82cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:58 -0800 Subject: net: add net device refcount tracker to struct netdev_rx_queue This helps debugging net device refcount leaks. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 143d60ed0047..3d691fadd569 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -741,6 +741,8 @@ struct netdev_rx_queue { #endif struct kobject kobj; struct net_device *dev; + netdevice_tracker dev_tracker; + #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif -- cgit v1.2.3 From 0b688f24b7d611db3a02f3d4ab562d049c78a17d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:59 -0800 Subject: net: add net device refcount tracker to struct netdev_queue This will help debugging pesky netdev reference leaks. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3d691fadd569..b4f704337f65 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -586,6 +586,8 @@ struct netdev_queue { * read-mostly part */ struct net_device *dev; + netdevice_tracker dev_tracker; + struct Qdisc __rcu *qdisc; struct Qdisc *qdisc_sleeping; #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 9038c320001dd07f60736018edf608ac5baca0ab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:03 -0800 Subject: net: dst: add net device refcount tracking to dst_entry We want to track all dev_hold()/dev_put() to ease leak hunting. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b4f704337f65..afed3b10491b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3883,6 +3883,23 @@ static inline void dev_put_track(struct net_device *dev, } } +static inline void dev_replace_track(struct net_device *odev, + struct net_device *ndev, + netdevice_tracker *tracker, + gfp_t gfp) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (odev) + ref_tracker_free(&odev->refcnt_tracker, tracker); +#endif + dev_hold(ndev); + dev_put(odev); +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (ndev) + ref_tracker_alloc(&ndev->refcnt_tracker, tracker, gfp); +#endif +} + /* Carrier loss detection, dial on demand. The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller * who is responsible for serialization of these calls. -- cgit v1.2.3 From c04438f58d140723e58050fcb9d33d84cb39e9e9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:12 -0800 Subject: ipv4: add net device refcount tracker to struct in_device Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 518b484a7f07..674aeead6260 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -24,6 +24,8 @@ struct ipv4_devconf { struct in_device { struct net_device *dev; + netdevice_tracker dev_tracker; + refcount_t refcnt; int dead; struct in_ifaddr __rcu *ifa_list;/* IP ifaddr chain */ -- cgit v1.2.3 From 63f13937cbe9b00982dfc8e578b1aec8e5037333 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:14 -0800 Subject: net: linkwatch: add net device refcount tracker Add a netdevice_tracker inside struct net_device, to track the self reference when a device is in lweventlist. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index afed3b10491b..69dca1edd5a6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1950,6 +1950,7 @@ enum netdev_ml_priv_type { * keep a list of interfaces to be deleted. * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. + * @linkwatch_dev_tracker: refcount tracker used by linkwatch. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2280,6 +2281,7 @@ struct net_device { struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; u8 dev_addr_shadow[MAX_ADDR_LEN]; + netdevice_tracker linkwatch_dev_tracker; }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From 42120a86438379eb77424831ae3d696c2d5cb622 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:16 -0800 Subject: ipmr, ip6mr: add net device refcount tracker to struct vif_device Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 8071148f29a6..e05ee9f001ff 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -12,6 +12,7 @@ /** * struct vif_device - interface representor for multicast routing * @dev: network device being used + * @dev_tracker: refcount tracker for @dev reference * @bytes_in: statistic; bytes ingressing * @bytes_out: statistic; bytes egresing * @pkt_in: statistic; packets ingressing @@ -26,6 +27,7 @@ */ struct vif_device { struct net_device *dev; + netdevice_tracker dev_tracker; unsigned long bytes_in, bytes_out; unsigned long pkt_in, pkt_out; unsigned long rate_limit; -- cgit v1.2.3 From 5fa5ae605821e0e10ee489d9a6e331fd287ccc57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:17 -0800 Subject: netpoll: add net device refcount tracker to struct netpoll Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index e6a2d72e0dc7..bd19c4b91e31 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -24,6 +24,7 @@ union inet_addr { struct netpoll { struct net_device *dev; + netdevice_tracker dev_tracker; char dev_name[IFNAMSIZ]; const char *name; -- cgit v1.2.3 From 45cac6754529ae17345d8f5b632d9e602a091a20 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Dec 2021 20:53:56 -0800 Subject: net: fix recent csum changes Vladimir reported csum issues after my recent change in skb_postpull_rcsum() Issue here is the following: initial skb->csum is the csum of [part to be pulled][rest of packet] Old code: skb->csum = csum_sub(skb->csum, csum_partial(pull, pull_length, 0)); New code: skb->csum = ~csum_partial(pull, pull_length, ~skb->csum); This is broken if the csum of [pulled part] happens to be equal to skb->csum, because end result of skb->csum is 0 in new code, instead of being 0xffffffff David Laight suggested to use skb->csum = -csum_partial(pull, pull_length, -skb->csum); I based my patches on existing code present in include/net/seg6.h, update_csum_diff4() and update_csum_diff16() which might need a similar fix. I guess that my tests, mostly pulling 40 bytes of IPv6 header were not providing enough entropy to hit this bug. v2: added wsum_negate() to make sparse happy. Fixes: 29c3002644bd ("net: optimize skb_postpull_rcsum()") Fixes: 0bd28476f636 ("gro: optimize skb_gro_postpull_rcsum()") Signed-off-by: Eric Dumazet Reported-by: Vladimir Oltean Suggested-by: David Laight Cc: David Lebrun Tested-by: Vladimir Oltean Link: https://lore.kernel.org/r/20211204045356.3659278-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index eae4bd3237a4..dd262bd8ddbe 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3486,7 +3486,8 @@ static inline void skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = ~csum_partial(start, len, ~skb->csum); + skb->csum = wsum_negate(csum_partial(start, len, + wsum_negate(skb->csum))); else if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; -- cgit v1.2.3 From 13244cccc2b61ec715f0ac583d3037497004d4a5 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 1 Dec 2021 10:54:52 -0800 Subject: skbuff: introduce skb_pull_data Like skb_pull but returns the original data pointer before pulling the data after performing a check against sbk->len. This allows to change code that does "struct foo *p = (void *)skb->data;" which is hard to audit and error prone, to: p = skb_pull_data(skb, sizeof(*p)); if (!p) return; Which is both safer and cleaner. Acked-by: Jakub Kicinski Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Dan Carpenter Signed-off-by: Marcel Holtmann --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index eba256af64a5..877dda38684a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2373,6 +2373,8 @@ static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len) return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); } +void *skb_pull_data(struct sk_buff *skb, size_t len); + void *__pskb_pull_tail(struct sk_buff *skb, int delta); static inline void *__pskb_pull(struct sk_buff *skb, unsigned int len) -- cgit v1.2.3 From 6fadaa565882cd7afc501de5921db6f5e45c784b Mon Sep 17 00:00:00 2001 From: Maxim Galaganov Date: Fri, 3 Dec 2021 14:35:39 -0800 Subject: tcp: expose __tcp_sock_set_cork and __tcp_sock_set_nodelay Expose __tcp_sock_set_cork() and __tcp_sock_set_nodelay() for use in MPTCP setsockopt code -- namely for syncing MPTCP socket options with subflows inside sync_socket_options() while already holding the subflow socket lock. Acked-by: Paolo Abeni Acked-by: Matthieu Baerts Signed-off-by: Maxim Galaganov Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 48d8a363319e..78b91bb92f0d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -512,11 +512,13 @@ static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); +void __tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle_locked(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); +void __tcp_sock_set_nodelay(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); -- cgit v1.2.3 From 08f0b22d731fa86957749c649d6ef6ebc07e8ad2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Dec 2021 17:30:27 -0800 Subject: net: eql: add net device refcount tracker Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/if_eql.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_eql.h b/include/linux/if_eql.h index d984694c384d..d75601d613cc 100644 --- a/include/linux/if_eql.h +++ b/include/linux/if_eql.h @@ -26,6 +26,7 @@ typedef struct slave { struct list_head list; struct net_device *dev; + netdevice_tracker dev_tracker; long priority; long priority_bps; long priority_Bps; -- cgit v1.2.3 From 19c9ebf6ed70856385296a65e78c1699081b152f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Dec 2021 17:30:28 -0800 Subject: vlan: add net device refcount tracker Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 41a518336673..8420fe504927 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -162,6 +162,7 @@ struct netpoll; * @vlan_id: VLAN identifier * @flags: device flags * @real_dev: underlying netdevice + * @dev_tracker: refcount tracker for @real_dev reference * @real_dev_addr: address of underlying netdevice * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats @@ -177,6 +178,8 @@ struct vlan_dev_priv { u16 flags; struct net_device *real_dev; + netdevice_tracker dev_tracker; + unsigned char real_dev_addr[ETH_ALEN]; struct proc_dir_entry *dent; -- cgit v1.2.3 From f12bf6f3f942b37de65eeea8be25903587fec930 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Dec 2021 17:30:30 -0800 Subject: net: watchdog: add net device refcount tracker Add a netdevice_tracker inside struct net_device, to track the self reference when a device has an active watchdog timer. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69dca1edd5a6..1a748ee9a421 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1951,6 +1951,7 @@ enum netdev_ml_priv_type { * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. + * @watchdog_dev_tracker: refcount tracker used by watchdog. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2282,6 +2283,7 @@ struct net_device { u8 dev_addr_shadow[MAX_ADDR_LEN]; netdevice_tracker linkwatch_dev_tracker; + netdevice_tracker watchdog_dev_tracker; }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From 330c6d3bfa268794bf692165d0f781f1c2d4d83e Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 24 Nov 2021 10:45:36 +0900 Subject: can: bittiming: replace CAN units with the generic ones from linux/units.h In [1], we introduced a set of units in linux/can/bittiming.h. Since then, generic SI prefixes were added to linux/units.h in [2]. Those new prefixes can perfectly replace CAN specific ones. This patch replaces all occurrences of the CAN units with their corresponding prefix (from linux/units) and the unit (as a comment) according to below table. CAN units SI metric prefix (from linux/units) + unit (as a comment) ------------------------------------------------------------------------ CAN_KBPS KILO /* BPS */ CAN_MBPS MEGA /* BPS */ CAM_MHZ MEGA /* Hz */ The definition are then removed from linux/can/bittiming.h [1] commit 1d7750760b70 ("can: bittiming: add CAN_KBPS, CAN_MBPS and CAN_MHZ macros") [2] commit 26471d4a6cf8 ("units: Add SI metric prefix definitions") Link: https://lore.kernel.org/all/20211124014536.782550-1-mailhol.vincent@wanadoo.fr Suggested-by: Jimmy Assarsson Suggested-by: Oliver Hartkopp Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 20b50baf3a02..a81652d1c6f3 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -12,13 +12,6 @@ #define CAN_SYNC_SEG 1 -/* Kilobits and Megabits per second */ -#define CAN_KBPS 1000UL -#define CAN_MBPS 1000000UL - -/* Megahertz */ -#define CAN_MHZ 1000000UL - #define CAN_CTRLMODE_TDC_MASK \ (CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_TDC_MANUAL) -- cgit v1.2.3 From 3f9bb0301d50ce27421eff4b710c2bbe58111a83 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 6 Dec 2021 18:57:47 +0200 Subject: net: dsa: make dp->bridge_num one-based MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I have seen too many bugs already due to the fact that we must encode an invalid dp->bridge_num as a negative value, because the natural tendency is to check that invalid value using (!dp->bridge_num). Latest example can be seen in commit 1bec0f05062c ("net: dsa: fix bridge_num not getting cleared after ports leaving the bridge"). Convert the existing users to assume that dp->bridge_num == 0 is the encoding for invalid, and valid bridge numbers start from 1. Signed-off-by: Vladimir Oltean Reviewed-by: Alvin Šipraga Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 254b165f2b44..0af4371fbebb 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -38,13 +38,13 @@ void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id); int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, struct net_device *br, - int bridge_num); + unsigned int bridge_num); void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, struct net_device *br, - int bridge_num); + unsigned int bridge_num); -u16 dsa_8021q_bridge_tx_fwd_offload_vid(int bridge_num); +u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp); -- cgit v1.2.3 From d3eed0e57d5d1bcbf1bd60f83a4adfe7d7b8dd9c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 6 Dec 2021 18:57:56 +0200 Subject: net: dsa: keep the bridge_dev and bridge_num as part of the same structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main desire behind this is to provide coherent bridge information to the fast path without locking. For example, right now we set dp->bridge_dev and dp->bridge_num from separate code paths, it is theoretically possible for a packet transmission to read these two port properties consecutively and find a bridge number which does not correspond with the bridge device. Another desire is to start passing more complex bridge information to dsa_switch_ops functions. For example, with FDB isolation, it is expected that drivers will need to be passed the bridge which requested an FDB/MDB entry to be offloaded, and along with that bridge_dev, the associated bridge_num should be passed too, in case the driver might want to implement an isolation scheme based on that number. We already pass the {bridge_dev, bridge_num} pair to the TX forwarding offload switch API, however we'd like to remove that and squash it into the basic bridge join/leave API. So that means we need to pass this pair to the bridge join/leave API. During dsa_port_bridge_leave, first we unset dp->bridge_dev, then we call the driver's .port_bridge_leave with what used to be our dp->bridge_dev, but provided as an argument. When bridge_dev and bridge_num get folded into a single structure, we need to preserve this behavior in dsa_port_bridge_leave: we need a copy of what used to be in dp->bridge. Switch drivers check bridge membership by comparing dp->bridge_dev with the provided bridge_dev, but now, if we provide the struct dsa_bridge as a pointer, they cannot keep comparing dp->bridge to the provided pointer, since this only points to an on-stack copy. To make this obvious and prevent driver writers from forgetting and doing stupid things, in this new API, the struct dsa_bridge is provided as a full structure (not very large, contains an int and a pointer) instead of a pointer. An explicit comparison function needs to be used to determine bridge membership: dsa_port_offloads_bridge(). Signed-off-by: Vladimir Oltean Reviewed-by: Alvin Šipraga Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 0af4371fbebb..939a1beaddf7 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -7,6 +7,7 @@ #include #include +#include struct dsa_switch; struct dsa_port; @@ -37,12 +38,10 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id); int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, - struct net_device *br, - unsigned int bridge_num); + struct dsa_bridge bridge); void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, - struct net_device *br, - unsigned int bridge_num); + struct dsa_bridge bridge); u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); -- cgit v1.2.3 From 283e6f5a8166f075eba78da6b867d76cc5d47e77 Mon Sep 17 00:00:00 2001 From: Sergey Ryazanov Date: Tue, 7 Dec 2021 12:21:40 +0300 Subject: net: wwan: make debugfs optional Debugfs interface is optional for the regular modem use. Some distros and users will want to disable this feature for security or kernel size reasons. So add a configuration option that allows to completely disable the debugfs interface of the WWAN devices. A primary considered use case for this option was embedded firmwares. For example, in OpenWrt, you can not completely disable debugfs, as a lot of wireless stuff can only be configured and monitored with the debugfs knobs. At the same time, reducing the size of a kernel and modules is an essential task in the world of embedded software. Disabling the WWAN and IOSM debugfs interfaces allows us to save 50K (x86-64 build) of space for module storage. Not much, but already considerable when you only have 16MB of storage. So it is hard to just disable whole debugfs. Users need some fine grained set of options to control which debugfs interface is important and should be available and which is not. The new configuration symbol is enabled by default and is hidden under the EXPERT option. So a regular user would not be bothered by another one configuration question. While an embedded distro maintainer will be able to a little more reduce the final image size. Signed-off-by: Sergey Ryazanov Reviewed-by: Leon Romanovsky Reviewed-by: Loic Poulain Acked-by: M Chetan Kumar Signed-off-by: Jakub Kicinski --- include/linux/wwan.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 1646aa3e6779..e143c88bf4b0 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -171,6 +171,13 @@ int wwan_register_ops(struct device *parent, const struct wwan_ops *ops, void wwan_unregister_ops(struct device *parent); +#ifdef CONFIG_WWAN_DEBUGFS struct dentry *wwan_get_debugfs_dir(struct device *parent); +#else +static inline struct dentry *wwan_get_debugfs_dir(struct device *parent) +{ + return ERR_PTR(-ENODEV); +} +#endif #endif /* __WWAN_H */ -- cgit v1.2.3 From 3e5b1feccea7db576353ffc302f78d522e4116e6 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 9 Dec 2021 13:11:32 +0000 Subject: net: phylink: add legacy_pre_march2020 indicator Add a boolean to phylink_config to indicate whether a driver has not been updated for the changes in commit 7cceb599d15d ("net: phylink: avoid mac_config calls"), and thus are reliant on the old behaviour. We were currently keying the phylink behaviour on the presence of a PCS, but this is sub-optimal for modern drivers that may not have a PCS. This commit merely introduces the new flag, but does not add any use, since we need all legacy drivers to set this flag before it can be used. Once these legacy drivers have been updated, we can remove this flag. Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 01224235df0f..d005b8e36048 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -84,6 +84,8 @@ enum phylink_op_type { * struct phylink_config - PHYLINK configuration structure * @dev: a pointer to a struct device associated with the MAC * @type: operation type of PHYLINK instance + * @legacy_pre_march2020: driver has not been updated for March 2020 updates + * (See commit 7cceb599d15d ("net: phylink: avoid mac_config calls") * @pcs_poll: MAC PCS cannot provide link change interrupt * @poll_fixed_state: if true, starts link_poll, * if MAC link is at %MLO_AN_FIXED mode. @@ -97,6 +99,7 @@ enum phylink_op_type { struct phylink_config { struct device *dev; enum phylink_op_type type; + bool legacy_pre_march2020; bool pcs_poll; bool poll_fixed_state; bool ovr_an_inband; -- cgit v1.2.3 From 001f4261fe4d5ae710cf1f445b6cae6d9d3ae26e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 9 Dec 2021 13:11:48 +0000 Subject: net: phylink: use legacy_pre_march2020 Use the legacy flag to indicate whether we should operate in legacy mode. This allows us to stop using the presence of a PCS as an indicator to the age of the phylink user, and make PCS presence optional. Legacy mode involves: 1) calling mac_config() whenever the link comes up 2) calling mac_config() whenever the inband advertisement changes, possibly followed by a call to mac_an_restart() 3) making use of mac_an_restart() 4) making use of mac_pcs_get_state() All the above functionality was moved to a seperate "PCS" block of operations in March 2020. Update the documents to indicate that the differences that this flag makes. Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index d005b8e36048..a2f266cc3442 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -190,6 +190,10 @@ void validate(struct phylink_config *config, unsigned long *supported, * negotiation completion state in @state->an_complete, and link up state * in @state->link. If possible, @state->lp_advertising should also be * populated. + * + * Note: This is a legacy method. This function will not be called unless + * legacy_pre_march2020 is set in &struct phylink_config and there is no + * PCS attached. */ void mac_pcs_get_state(struct phylink_config *config, struct phylink_link_state *state); @@ -230,6 +234,15 @@ int mac_prepare(struct phylink_config *config, unsigned int mode, * guaranteed to be correct, and so any mac_config() implementation must * never reference these fields. * + * Note: For legacy March 2020 drivers (drivers with legacy_pre_march2020 set + * in their &phylnk_config and which don't have a PCS), this function will be + * called on each link up event, and to also change the in-band advert. For + * non-legacy drivers, it will only be called to reconfigure the MAC for a + * "major" change in e.g. interface mode. It will not be called for changes + * in speed, duplex or pause modes or to change the in-band advertisement. + * In any case, it is strongly preferred that speed, duplex and pause settings + * are handled in the mac_link_up() method and not in this method. + * * (this requires a rewrite - please refer to mac_link_up() for situations * where the PCS and MAC are not tightly integrated.) * @@ -314,6 +327,10 @@ int mac_finish(struct phylink_config *config, unsigned int mode, /** * mac_an_restart() - restart 802.3z BaseX autonegotiation * @config: a pointer to a &struct phylink_config. + * + * Note: This is a legacy method. This function will not be called unless + * legacy_pre_march2020 is set in &struct phylink_config and there is no + * PCS attached. */ void mac_an_restart(struct phylink_config *config); -- cgit v1.2.3 From 1a2fb220edca98d18f90e3ef5bd6853a6b22b1b8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 6 Dec 2021 22:27:58 -0800 Subject: skbuff: Extract list pointers to silence compiler warnings Under both -Warray-bounds and the object_size sanitizer, the compiler is upset about accessing prev/next of sk_buff when the object it thinks it is coming from is sk_buff_head. The warning is a false positive due to the compiler taking a conservative approach, opting to warn at casting time rather than access time. However, in support of enabling -Warray-bounds globally (which has found many real bugs), arrange things for sk_buff so that the compiler can unambiguously see that there is no intention to access anything except prev/next. Introduce and cast to a separate struct sk_buff_list, which contains _only_ the first two fields, silencing the warnings: In file included from ./include/net/net_namespace.h:39, from ./include/linux/netdevice.h:37, from net/core/netpoll.c:17: net/core/netpoll.c: In function 'refill_skbs': ./include/linux/skbuff.h:2086:9: warning: array subscript 'struct sk_buff[0]' is partly outside array bounds of 'struct sk_buff_head[1]' [-Warray-bounds] 2086 | __skb_insert(newsk, next->prev, next, list); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/netpoll.c:49:28: note: while referencing 'skb_pool' 49 | static struct sk_buff_head skb_pool; | ^~~~~~~~ This change results in no executable instruction differences. Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20211207062758.2324338-1-keescook@chromium.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dd262bd8ddbe..6535294f6a48 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -292,9 +292,11 @@ struct tc_skb_ext { #endif struct sk_buff_head { - /* These two members must be first. */ - struct sk_buff *next; - struct sk_buff *prev; + /* These two members must be first to match sk_buff. */ + struct_group_tagged(sk_buff_list, list, + struct sk_buff *next; + struct sk_buff *prev; + ); __u32 qlen; spinlock_t lock; @@ -730,7 +732,7 @@ typedef unsigned char *sk_buff_data_t; struct sk_buff { union { struct { - /* These two members must be first. */ + /* These two members must be first to match sk_buff_head. */ struct sk_buff *next; struct sk_buff *prev; @@ -1976,8 +1978,8 @@ static inline void __skb_insert(struct sk_buff *newsk, */ WRITE_ONCE(newsk->next, next); WRITE_ONCE(newsk->prev, prev); - WRITE_ONCE(next->prev, newsk); - WRITE_ONCE(prev->next, newsk); + WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk); + WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk); WRITE_ONCE(list->qlen, list->qlen + 1); } @@ -2073,7 +2075,7 @@ static inline void __skb_queue_after(struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *newsk) { - __skb_insert(newsk, prev, prev->next, list); + __skb_insert(newsk, prev, ((struct sk_buff_list *)prev)->next, list); } void skb_append(struct sk_buff *old, struct sk_buff *newsk, @@ -2083,7 +2085,7 @@ static inline void __skb_queue_before(struct sk_buff_head *list, struct sk_buff *next, struct sk_buff *newsk) { - __skb_insert(newsk, next->prev, next, list); + __skb_insert(newsk, ((struct sk_buff_list *)next)->prev, next, list); } /** -- cgit v1.2.3 From 9ba74e6c9e9d0c5c1e5792a7111fc7d1a0589cb8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Dec 2021 23:44:21 -0800 Subject: net: add networking namespace refcount tracker We have 100+ syzbot reports about netns being dismantled too soon, still unresolved as of today. We think a missing get_net() or an extra put_net() is the root cause. In order to find the bug(s), and be able to spot future ones, this patch adds CONFIG_NET_NS_REFCNT_TRACKER and new helpers to precisely pair all put_net() with corresponding get_net(). To use these helpers, each data structure owning a refcount should also use a "netns_tracker" to pair the get and put. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a748ee9a421..235d5d082f1a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -48,7 +48,7 @@ #include #include #include -#include +#include struct netpoll_info; struct device; @@ -300,13 +300,6 @@ enum netdev_state_t { __LINK_STATE_TESTING, }; - -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER -typedef struct ref_tracker *netdevice_tracker; -#else -typedef struct {} netdevice_tracker; -#endif - struct gro_list { struct list_head list; int count; -- cgit v1.2.3 From 04a931e58d1944ab3d1e11fdfde1947fbe5b6a37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Dec 2021 23:44:23 -0800 Subject: net: add netns refcount tracker to struct seq_net_private Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/seq_file_net.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seq_file_net.h b/include/linux/seq_file_net.h index 0fdbe1ddd8d1..b97912fdbae7 100644 --- a/include/linux/seq_file_net.h +++ b/include/linux/seq_file_net.h @@ -9,7 +9,8 @@ extern struct net init_net; struct seq_net_private { #ifdef CONFIG_NET_NS - struct net *net; + struct net *net; + netns_tracker ns_tracker; #endif }; -- cgit v1.2.3 From c5fb19937455095573a19ddcbff32e993ed10e35 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 10 Dec 2021 22:16:49 +0800 Subject: bpf: Add bpf_strncmp helper The helper compares two strings: one string is a null-terminated read-only string, and another string has const max storage size but doesn't need to be null-terminated. It can be used to compare file name in tracing or LSM program. Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211210141652.877186-2-houtao1@huawei.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0ceb54c6342f..7a40022e3d00 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2163,6 +2163,7 @@ extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; +extern const struct bpf_func_proto bpf_strncmp_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From 35d976802124303a5b3eb7ec3ed188d568204373 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:38 +0200 Subject: net: dsa: tag_ocelot: convert to tagger-owned data The felix driver makes very light use of dp->priv, and the tagger is effectively stateless. dp->priv is practically only needed to set up a callback to perform deferred xmit of PTP and STP packets using the ocelot-8021q tagging protocol (the main ocelot tagging protocol makes no use of dp->priv, although this driver sets up dp->priv irrespective of actual tagging protocol in use). struct felix_port (what used to be pointed to by dp->priv) is removed and replaced with a two-sided structure. The public side of this structure, visible to the switch driver, is ocelot_8021q_tagger_data. The private side is ocelot_8021q_tagger_private, and the latter structure physically encapsulates the former. The public half of the tagger data structure can be accessed through a helper of the same name (ocelot_8021q_tagger_data) which also sanity-checks the protocol currently in use by the switch. The public/private split was requested by Andrew Lunn. Suggested-by: Andrew Lunn Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/ocelot.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index 7ee708ad7df2..dca2969015d8 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -8,6 +8,7 @@ #include #include #include +#include struct ocelot_skb_cb { struct sk_buff *clone; @@ -168,11 +169,18 @@ struct felix_deferred_xmit_work { struct kthread_work work; }; -struct felix_port { +struct ocelot_8021q_tagger_data { void (*xmit_work_fn)(struct kthread_work *work); - struct kthread_worker *xmit_worker; }; +static inline struct ocelot_8021q_tagger_data * +ocelot_8021q_tagger_data(struct dsa_switch *ds) +{ + BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_OCELOT_8021Q); + + return ds->tagger_data; +} + static inline void ocelot_xfh_get_rew_val(void *extraction, u64 *rew_val) { packing(extraction, rew_val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0); -- cgit v1.2.3 From d38049bbe7601f38d598f5da5ff09980483b290a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:40 +0200 Subject: net: dsa: sja1105: bring deferred xmit implementation in line with ocelot-8021q When the ocelot-8021q driver was converted to deferred xmit as part of commit 8d5f7954b7c8 ("net: dsa: felix: break at first CPU port during init and teardown"), the deferred implementation was deliberately made subtly different from what sja1105 has. The implementation differences lied on the following observations: - There might be a race between these two lines in tag_sja1105.c: skb_queue_tail(&sp->xmit_queue, skb_get(skb)); kthread_queue_work(sp->xmit_worker, &sp->xmit_work); and the skb dequeue logic in sja1105_port_deferred_xmit(). For example, the xmit_work might be already queued, however the work item has just finished walking through the skb queue. Because we don't check the return code from kthread_queue_work, we don't do anything if the work item is already queued. However, nobody will take that skb and send it, at least until the next timestampable skb is sent. This creates additional (and avoidable) TX timestamping latency. To close that race, what the ocelot-8021q driver does is it doesn't keep a single work item per port, and a skb timestamping queue, but rather dynamically allocates a work item per packet. - It is also unnecessary to have more than one kthread that does the work. So delete the per-port kthread allocations and replace them with a single kthread which is global to the switch. This change brings the two implementations in line by applying those observations to the sja1105 driver as well. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index e6c78be40bde..acd9d2afccab 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -37,6 +37,12 @@ #define SJA1105_HWTS_RX_EN 0 +struct sja1105_deferred_xmit_work { + struct dsa_port *dp; + struct sk_buff *skb; + struct kthread_work work; +}; + /* Global tagger data: each struct sja1105_port has a reference to * the structure defined in struct sja1105_private. */ @@ -52,6 +58,8 @@ struct sja1105_tagger_data { * 2-step TX timestamps */ struct sk_buff_head skb_txtstamp_queue; + struct kthread_worker *xmit_worker; + void (*xmit_work_fn)(struct kthread_work *work); }; struct sja1105_skb_cb { @@ -65,9 +73,6 @@ struct sja1105_skb_cb { ((struct sja1105_skb_cb *)((skb)->cb)) struct sja1105_port { - struct kthread_worker *xmit_worker; - struct kthread_work xmit_work; - struct sk_buff_head xmit_queue; struct sja1105_tagger_data *data; bool hwts_tx_en; }; -- cgit v1.2.3 From 6f6770ab1ce2b56619264ec6be0b62f05564dcf6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:41 +0200 Subject: net: dsa: sja1105: remove hwts_tx_en from tagger data This tagger property is in fact not used at all by the tagger, only by the switch driver. Therefore it makes sense to be moved to sja1105_private. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index acd9d2afccab..32a8a1344cf6 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -74,7 +74,6 @@ struct sja1105_skb_cb { struct sja1105_port { struct sja1105_tagger_data *data; - bool hwts_tx_en; }; /* Timestamps are in units of 8 ns clock ticks (equivalent to -- cgit v1.2.3 From bfcf1425222008e7390c0784b0f3bb7b497fccaa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:42 +0200 Subject: net: dsa: sja1105: make dp->priv point directly to sja1105_tagger_data The design of the sja1105 tagger dp->priv is that each port has a separate struct sja1105_port, and the sp->data pointer points to a common struct sja1105_tagger_data. We have removed all per-port members accessible by the tagger, and now only struct sja1105_tagger_data remains. Make dp->priv point directly to this. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 32a8a1344cf6..1dda9cce85d9 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -43,9 +43,7 @@ struct sja1105_deferred_xmit_work { struct kthread_work work; }; -/* Global tagger data: each struct sja1105_port has a reference to - * the structure defined in struct sja1105_private. - */ +/* Global tagger data */ struct sja1105_tagger_data { struct sk_buff *stampable_skb; /* Protects concurrent access to the meta state machine @@ -72,10 +70,6 @@ struct sja1105_skb_cb { #define SJA1105_SKB_CB(skb) \ ((struct sja1105_skb_cb *)((skb)->cb)) -struct sja1105_port { - struct sja1105_tagger_data *data; -}; - /* Timestamps are in units of 8 ns clock ticks (equivalent to * a fixed 125 MHz clock). */ -- cgit v1.2.3 From 22ee9f8e4011fb8a6d75dc2f9a43360d4f400235 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:43 +0200 Subject: net: dsa: sja1105: move ts_id from sja1105_tagger_data The TX timestamp ID is incremented by the SJA1110 PTP timestamping callback (->port_tx_timestamp) for every packet, when cloning it. It isn't used by the tagger at all, even though it sits inside the struct sja1105_tagger_data. Also, serialization to this structure is currently done through tagger_data->meta_lock, which is a cheap hack because the meta_lock isn't used for anything else on SJA1110 (sja1105_rcv_meta_state_machine isn't called). This change moves ts_id from sja1105_tagger_data to sja1105_private and introduces a dedicated spinlock for it, also in sja1105_private. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 1dda9cce85d9..d8ee53085c09 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -51,7 +51,6 @@ struct sja1105_tagger_data { */ spinlock_t meta_lock; unsigned long state; - u8 ts_id; /* Used on SJA1110 where meta frames are generated only for * 2-step TX timestamps */ -- cgit v1.2.3 From c79e84866d2ac637fce921a28288f214e91d662b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:44 +0200 Subject: net: dsa: tag_sja1105: convert to tagger-owned data Currently, struct sja1105_tagger_data is a part of struct sja1105_private, and is used by the sja1105 driver to populate dp->priv. With the movement towards tagger-owned storage, the sja1105 driver should not be the owner of this memory. This change implements the connection between the sja1105 switch driver and its tagging protocol, which means that sja1105_tagger_data no longer stays in dp->priv but in ds->tagger_data, and that the sja1105 driver now only populates the sja1105_port_deferred_xmit callback pointer. The kthread worker is now the responsibility of the tagger. The sja1105 driver also alters the tagger's state some more, especially with regard to the PTP RX timestamping state. This will be fixed up a bit in further changes. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index d8ee53085c09..9f7d42cbbc08 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -84,9 +84,12 @@ static inline s64 sja1105_ticks_to_ns(s64 ticks) return ticks * SJA1105_TICK_NS; } -static inline bool dsa_port_is_sja1105(struct dsa_port *dp) +static inline struct sja1105_tagger_data * +sja1105_tagger_data(struct dsa_switch *ds) { - return true; + BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1105); + + return ds->tagger_data; } #endif /* _NET_DSA_SJA1105_H */ -- cgit v1.2.3 From fcbf979a5b4b5784bfb5647ae6190cd5c2ae595d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:45 +0200 Subject: Revert "net: dsa: move sja1110_process_meta_tstamp inside the tagging protocol driver" This reverts commit 6d709cadfde68dbd12bef12fcced6222226dcb06. The above change was done to avoid calling symbols exported by the switch driver from the tagging protocol driver. With the tagger-owned storage model, we have a new option on our hands, and that is for the switch driver to provide a data consumer handler in the form of a function pointer inside the ->connect_tag_protocol() method. Having a function pointer avoids the problems of the exported symbols approach. By creating a handler for metadata frames holding TX timestamps on SJA1110, we are able to eliminate an skb queue from the tagger data, and replace it with a simple, and stateless, function pointer. This skb queue is now handled exclusively by sja1105_ptp.c, which makes the code easier to follow, as it used to be before the reverted patch. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 9f7d42cbbc08..d216211b64f8 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -37,6 +37,11 @@ #define SJA1105_HWTS_RX_EN 0 +enum sja1110_meta_tstamp { + SJA1110_META_TSTAMP_TX = 0, + SJA1110_META_TSTAMP_RX = 1, +}; + struct sja1105_deferred_xmit_work { struct dsa_port *dp; struct sk_buff *skb; @@ -51,12 +56,10 @@ struct sja1105_tagger_data { */ spinlock_t meta_lock; unsigned long state; - /* Used on SJA1110 where meta frames are generated only for - * 2-step TX timestamps - */ - struct sk_buff_head skb_txtstamp_queue; struct kthread_worker *xmit_worker; void (*xmit_work_fn)(struct kthread_work *work); + void (*meta_tstamp_handler)(struct dsa_switch *ds, int port, u8 ts_id, + enum sja1110_meta_tstamp dir, u64 tstamp); }; struct sja1105_skb_cb { @@ -69,21 +72,6 @@ struct sja1105_skb_cb { #define SJA1105_SKB_CB(skb) \ ((struct sja1105_skb_cb *)((skb)->cb)) -/* Timestamps are in units of 8 ns clock ticks (equivalent to - * a fixed 125 MHz clock). - */ -#define SJA1105_TICK_NS 8 - -static inline s64 ns_to_sja1105_ticks(s64 ns) -{ - return ns / SJA1105_TICK_NS; -} - -static inline s64 sja1105_ticks_to_ns(s64 ticks) -{ - return ticks * SJA1105_TICK_NS; -} - static inline struct sja1105_tagger_data * sja1105_tagger_data(struct dsa_switch *ds) { -- cgit v1.2.3 From 950a419d9de13668f86828394cb242a1f9dece74 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 Dec 2021 01:34:46 +0200 Subject: net: dsa: tag_sja1105: split sja1105_tagger_data into private and public sections The sja1105 driver messes with the tagging protocol's state when PTP RX timestamping is enabled/disabled. This is fundamentally necessary because the tagger needs to know what to do when it receives a PTP packet. If RX timestamping is enabled, then a metadata follow-up frame is expected, and this holds the (partial) timestamp. So the tagger plays hide-and-seek with the network stack until it also gets the metadata frame, and then presents a single packet, the timestamped PTP packet. But when RX timestamping isn't enabled, there is no metadata frame expected, so the hide-and-seek game must be turned off and the packet must be delivered right away to the network stack. Considering this, we create a pseudo isolation by devising two tagger methods callable by the switch: one to get the RX timestamping state, and one to set it. Since we can't export symbols between the tagger and the switch driver, these methods are exposed through function pointers. After this change, the public portion of the sja1105_tagger_data contains only function pointers. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index d216211b64f8..e9cb1ae6d742 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -35,8 +35,6 @@ #define SJA1105_META_SMAC 0x222222222222ull #define SJA1105_META_DMAC 0x0180C200000Eull -#define SJA1105_HWTS_RX_EN 0 - enum sja1110_meta_tstamp { SJA1110_META_TSTAMP_TX = 0, SJA1110_META_TSTAMP_RX = 1, @@ -50,16 +48,13 @@ struct sja1105_deferred_xmit_work { /* Global tagger data */ struct sja1105_tagger_data { - struct sk_buff *stampable_skb; - /* Protects concurrent access to the meta state machine - * from taggers running on multiple ports on SMP systems - */ - spinlock_t meta_lock; - unsigned long state; - struct kthread_worker *xmit_worker; + /* Tagger to switch */ void (*xmit_work_fn)(struct kthread_work *work); void (*meta_tstamp_handler)(struct dsa_switch *ds, int port, u8 ts_id, enum sja1110_meta_tstamp dir, u64 tstamp); + /* Switch to tagger */ + bool (*rxtstamp_get_state)(struct dsa_switch *ds); + void (*rxtstamp_set_state)(struct dsa_switch *ds, bool on); }; struct sja1105_skb_cb { -- cgit v1.2.3 From 3c118547f87e930d45a5787e386734015dd93b32 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 10 Dec 2021 21:29:59 +0100 Subject: u64_stats: Disable preemption on 32bit UP+SMP PREEMPT_RT during updates. On PREEMPT_RT the seqcount_t for synchronisation is required on 32bit architectures even on UP because the softirq (and the threaded IRQ handler) can be preempted. With the seqcount_t for synchronisation, a reader with higher priority can preempt the writer and then spin endlessly in read_seqcount_begin() while the writer can't make progress. To avoid such a lock up on PREEMPT_RT the writer must disable preemption during the update. There is no need to disable interrupts because no writer is using this API in hard-IRQ context on PREEMPT_RT. Disable preemption on 32bit-RT within the u64_stats write section. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- include/linux/u64_stats_sync.h | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index e8ec116c916b..6ad4e9032d53 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -66,7 +66,7 @@ #include struct u64_stats_sync { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) seqcount_t seq; #endif }; @@ -125,7 +125,7 @@ static inline void u64_stats_inc(u64_stats_t *p) } #endif -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) #else static inline void u64_stats_init(struct u64_stats_sync *syncp) @@ -135,15 +135,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp) static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); write_seqcount_begin(&syncp->seq); #endif } static inline void u64_stats_update_end(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); #endif } @@ -152,8 +156,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) { unsigned long flags = 0; -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - local_irq_save(flags); +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + else + local_irq_save(flags); write_seqcount_begin(&syncp->seq); #endif return flags; @@ -163,15 +170,18 @@ static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, unsigned long flags) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); - local_irq_restore(flags); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + else + local_irq_restore(flags); #endif } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_begin(&syncp->seq); #else return 0; @@ -180,7 +190,7 @@ static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync * static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_disable(); #endif return __u64_stats_fetch_begin(syncp); @@ -189,7 +199,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_retry(&syncp->seq, start); #else return false; @@ -199,7 +209,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_enable(); #endif return __u64_stats_fetch_retry(syncp, start); @@ -213,7 +223,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_disable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) local_irq_disable(); #endif return __u64_stats_fetch_begin(syncp); @@ -222,7 +234,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_enable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) local_irq_enable(); #endif return __u64_stats_fetch_retry(syncp, start); -- cgit v1.2.3 From f92c1e183604c20ce00eb889315fdaa8f2d9e509 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 8 Dec 2021 20:32:44 +0100 Subject: bpf: Add get_func_[arg|ret|arg_cnt] helpers Adding following helpers for tracing programs: Get n-th argument of the traced function: long bpf_get_func_arg(void *ctx, u32 n, u64 *value) Get return value of the traced function: long bpf_get_func_ret(void *ctx, u64 *value) Get arguments count of the traced function: long bpf_get_func_arg_cnt(void *ctx) The trampoline now stores number of arguments on ctx-8 address, so it's easy to verify argument index and find return value argument's position. Moving function ip address on the trampoline stack behind the number of functions arguments, so it's now stored on ctx-16 address if it's needed. All helpers above are inlined by verifier. Also bit unrelated small change - using newly added function bpf_prog_has_trampoline in check_get_func_ip. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211208193245.172141-5-jolsa@kernel.org --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7a40022e3d00..965fffaf0308 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -777,6 +777,7 @@ void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); int bpf_jit_charge_modmem(u32 pages); void bpf_jit_uncharge_modmem(u32 pages); +bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) @@ -805,6 +806,10 @@ static inline bool is_bpf_image_address(unsigned long address) { return false; } +static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) +{ + return false; +} #endif struct bpf_func_info_aux { -- cgit v1.2.3 From c8064e5b4adac5e1255cf4f3b374e75b5376e7ca Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Nov 2021 11:08:07 +0100 Subject: bpf: Let bpf_warn_invalid_xdp_action() report more info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In non trivial scenarios, the action id alone is not sufficient to identify the program causing the warning. Before the previous patch, the generated stack-trace pointed out at least the involved device driver. Let's additionally include the program name and id, and the relevant device name. If the user needs additional infos, he can fetch them via a kernel probe, leveraging the arguments added here. Signed-off-by: Paolo Abeni Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/ddb96bb975cbfddb1546cf5da60e77d5100b533c.1638189075.git.pabeni@redhat.com --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 68c6b5c208e7..60eec80fa1d4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1027,7 +1027,7 @@ void xdp_do_flush(void); */ #define xdp_do_flush_map xdp_do_flush -void bpf_warn_invalid_xdp_action(u32 act); +void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, -- cgit v1.2.3 From 22c3f2f56bd9c901e1da69040680c311f310635d Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 1 Dec 2021 11:36:18 -0800 Subject: net/mlx5: Separate FDB namespace This patch doesn't add an additional namespaces, but just separates the naming to be used by each FDB user, bypass and kernel. Downstream patches will actually split this up and allow to have more than single priority for the bypass users. Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed Acked-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index cd2d4c572367..b1aad14689e3 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -73,6 +73,7 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_KERNEL, MLX5_FLOW_NAMESPACE_LEFTOVERS, MLX5_FLOW_NAMESPACE_ANCHOR, + MLX5_FLOW_NAMESPACE_FDB_BYPASS, MLX5_FLOW_NAMESPACE_FDB, MLX5_FLOW_NAMESPACE_ESW_EGRESS, MLX5_FLOW_NAMESPACE_ESW_INGRESS, -- cgit v1.2.3 From c8a2a011cd04054a6577d8cf774adf9e09302876 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 14 Dec 2021 03:45:35 +0200 Subject: net: dsa: sja1105: fix broken connection with the sja1110 tagger The driver was incorrectly converted assuming that "sja1105" is the only tagger supported by this driver. This results in SJA1110 switches failing to probe: sja1105 spi1.0: Unable to connect to tag protocol "sja1110": -EPROTONOSUPPORT sja1105: probe of spi1.2 failed with error -93 Add DSA_TAG_PROTO_SJA1110 to the list of supported taggers by the sja1105 driver. The sja1105_tagger_data structure format is common for the two tagging protocols. Fixes: c79e84866d2a ("net: dsa: tag_sja1105: convert to tagger-owned data") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index e9cb1ae6d742..159e43171ccc 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -70,7 +70,8 @@ struct sja1105_skb_cb { static inline struct sja1105_tagger_data * sja1105_tagger_data(struct dsa_switch *ds) { - BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1105); + BUG_ON(ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1105 && + ds->dst->tag_ops->proto != DSA_TAG_PROTO_SJA1110); return ds->tagger_data; } -- cgit v1.2.3 From 9280ac2e6f199cddcd746a9ba459136b8666287b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Dec 2021 07:15:15 -0800 Subject: net: dev_replace_track() cleanup Use existing helpers (netdev_tracker_free() and netdev_tracker_alloc()) to remove ifdefery. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20211214151515.312535-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 235d5d082f1a..c06e9dc1a317 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3885,16 +3885,14 @@ static inline void dev_replace_track(struct net_device *odev, netdevice_tracker *tracker, gfp_t gfp) { -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER if (odev) - ref_tracker_free(&odev->refcnt_tracker, tracker); -#endif + netdev_tracker_free(odev, tracker); + dev_hold(ndev); dev_put(odev); -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (ndev) - ref_tracker_alloc(&ndev->refcnt_tracker, tracker, gfp); -#endif + netdev_tracker_alloc(ndev, tracker, gfp); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on -- cgit v1.2.3 From f1d9268e061863ead77b07f5a6807d063e28a1c2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Dec 2021 07:09:33 -0800 Subject: net: add net device refcount tracker to struct packet_type Most notable changes are in af_packet, tipc ones are trivial. Signed-off-by: Eric Dumazet Cc: Jon Maloy Cc: Ying Xue Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c06e9dc1a317..a419718612c6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2533,6 +2533,7 @@ struct packet_type { __be16 type; /* This is really htons(ether_type). */ bool ignore_outgoing; struct net_device *dev; /* NULL is wildcarded here */ + netdevice_tracker dev_tracker; int (*func) (struct sk_buff *, struct net_device *, struct packet_type *, -- cgit v1.2.3 From 685b1afd7911676691c4167f420e16a957f5a38e Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 9 Dec 2021 12:09:23 +0200 Subject: net/mlx5: Introduce log_max_current_uc_list_wr_supported bit Downstream patch will use this bit in order to know whether the device supports changing of max_uc_list. Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3636df90899a..d3899fc33fd7 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1621,7 +1621,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 ext_stride_num_range[0x1]; u8 roce_rw_supported[0x1]; - u8 reserved_at_3a2[0x1]; + u8 log_max_current_uc_list_wr_supported[0x1]; u8 log_max_stride_sz_rq[0x5]; u8 reserved_at_3a8[0x3]; u8 log_min_stride_sz_rq[0x5]; -- cgit v1.2.3 From d1e86325af377129adb7fc6f34eb044ca6068b47 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Dec 2021 15:34:15 +0000 Subject: net: phylink: add mac_select_pcs() method to phylink_mac_ops mac_select_pcs() allows us to have an explicit point to query which PCS the MAC wishes to use for a particular PHY interface mode, thereby allowing us to add support to validate the link settings with the PCS. Phylink will also use this to select the PCS to be used during a major configuration event without the MAC driver needing to call phylink_set_pcs(). Note that if mac_select_pcs() is present, the supported_interfaces bitmap must be filled in; this avoids mac_select_pcs() being called with PHY_INTERFACE_MODE_NA when we want to get support for all interface types. Phylink will return an error in phylink_create() unless this condition is satisfied. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index a2f266cc3442..b3086dcafeaf 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -112,6 +112,7 @@ struct phylink_config { /** * struct phylink_mac_ops - MAC operations structure. * @validate: Validate and update the link configuration. + * @mac_select_pcs: Select a PCS for the interface mode. * @mac_pcs_get_state: Read the current link state from the hardware. * @mac_prepare: prepare for a major reconfiguration of the interface. * @mac_config: configure the MAC for the selected mode and state. @@ -126,6 +127,8 @@ struct phylink_mac_ops { void (*validate)(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); + struct phylink_pcs *(*mac_select_pcs)(struct phylink_config *config, + phy_interface_t interface); void (*mac_pcs_get_state)(struct phylink_config *config, struct phylink_link_state *state); int (*mac_prepare)(struct phylink_config *config, unsigned int mode, @@ -178,6 +181,21 @@ struct phylink_mac_ops { */ void validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); +/** + * mac_select_pcs: Select a PCS for the interface mode. + * @config: a pointer to a &struct phylink_config. + * @interface: PHY interface mode for PCS + * + * Return the &struct phylink_pcs for the specified interface mode, or + * NULL if none is required, or an error pointer on error. + * + * This must not modify any state. It is used to query which PCS should + * be used. Phylink will use this during validation to ensure that the + * configuration is valid, and when setting a configuration to internally + * set the PCS that will be used. + */ +struct phylink_pcs *mac_select_pcs(struct phylink_config *config, + phy_interface_t interface); /** * mac_pcs_get_state() - Read the current inband link state from the hardware -- cgit v1.2.3 From 0d22d4b626a4eaa3196019092eb6c1919e9f8caa Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Dec 2021 15:34:20 +0000 Subject: net: phylink: add pcs_validate() method Add a hook for PCS to validate the link parameters. This avoids MAC drivers having to have knowledge of their PCS in their validate() method, thereby allowing several MAC drivers to be simplfied. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index b3086dcafeaf..713a0c928b7c 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -416,6 +416,7 @@ struct phylink_pcs { /** * struct phylink_pcs_ops - MAC PCS operations structure. + * @pcs_validate: validate the link configuration. * @pcs_get_state: read the current MAC PCS link state from the hardware. * @pcs_config: configure the MAC PCS for the selected mode and state. * @pcs_an_restart: restart 802.3z BaseX autonegotiation. @@ -423,6 +424,8 @@ struct phylink_pcs { * (where necessary). */ struct phylink_pcs_ops { + int (*pcs_validate)(struct phylink_pcs *pcs, unsigned long *supported, + const struct phylink_link_state *state); void (*pcs_get_state)(struct phylink_pcs *pcs, struct phylink_link_state *state); int (*pcs_config)(struct phylink_pcs *pcs, unsigned int mode, @@ -435,6 +438,23 @@ struct phylink_pcs_ops { }; #if 0 /* For kernel-doc purposes only. */ +/** + * pcs_validate() - validate the link configuration. + * @pcs: a pointer to a &struct phylink_pcs. + * @supported: ethtool bitmask for supported link modes. + * @state: a const pointer to a &struct phylink_link_state. + * + * Validate the interface mode, and advertising's autoneg bit, removing any + * media ethtool link modes that would not be supportable from the supported + * mask. Phylink will propagate the changes to the advertising mask. See the + * &struct phylink_mac_ops validate() method. + * + * Returns -EINVAL if the interface mode/autoneg mode is not supported. + * Returns non-zero positive if the link state can be supported. + */ +int pcs_validate(struct phylink_pcs *pcs, unsigned long *supported, + const struct phylink_link_state *state); + /** * pcs_get_state() - Read the current inband link state from the hardware * @pcs: a pointer to a &struct phylink_pcs. -- cgit v1.2.3 From f7ea534a0920dbaf71a8003936e178e14ec9271d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 15 Dec 2021 18:55:36 -0800 Subject: add includes masked by cgroup -> bpf dependency cgroup pulls in BPF which pulls in a lot of includes. We're about to break that chain so fix those who were depending on it. Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211216025538.1649516-2-kuba@kernel.org --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0dcfd265beed..4a021149eaf0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -611,6 +611,7 @@ struct swevent_hlist { #define PERF_ATTACH_SCHED_CB 0x20 #define PERF_ATTACH_CHILD 0x40 +struct bpf_prog; struct perf_cgroup; struct perf_buffer; -- cgit v1.2.3 From fd1740b6abac39f68ce12e201697f106e0f1d519 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 15 Dec 2021 18:55:38 -0800 Subject: bpf: Remove the cgroup -> bpf header dependecy Remove the dependency from cgroup-defs.h to bpf-cgroup.h and bpf.h. This reduces the incremental build size of x86 allmodconfig after bpf.h was touched from ~17k objects rebuilt to ~5k objects. bpf.h is 2.2kLoC and is modified relatively often. We need a new header with just the definition of struct cgroup_bpf and enum cgroup_bpf_attach_type, this is akin to cgroup-defs.h. Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20211216025538.1649516-4-kuba@kernel.org --- include/linux/bpf-cgroup-defs.h | 70 +++++++++++++++++++++++++++++++++++++++++ include/linux/bpf-cgroup.h | 57 +-------------------------------- include/linux/cgroup-defs.h | 2 +- 3 files changed, 72 insertions(+), 57 deletions(-) create mode 100644 include/linux/bpf-cgroup-defs.h (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h new file mode 100644 index 000000000000..695d1224a71b --- /dev/null +++ b/include/linux/bpf-cgroup-defs.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_CGROUP_DEFS_H +#define _BPF_CGROUP_DEFS_H + +#ifdef CONFIG_CGROUP_BPF + +#include +#include +#include + +struct bpf_prog_array; + +enum cgroup_bpf_attach_type { + CGROUP_BPF_ATTACH_TYPE_INVALID = -1, + CGROUP_INET_INGRESS = 0, + CGROUP_INET_EGRESS, + CGROUP_INET_SOCK_CREATE, + CGROUP_SOCK_OPS, + CGROUP_DEVICE, + CGROUP_INET4_BIND, + CGROUP_INET6_BIND, + CGROUP_INET4_CONNECT, + CGROUP_INET6_CONNECT, + CGROUP_INET4_POST_BIND, + CGROUP_INET6_POST_BIND, + CGROUP_UDP4_SENDMSG, + CGROUP_UDP6_SENDMSG, + CGROUP_SYSCTL, + CGROUP_UDP4_RECVMSG, + CGROUP_UDP6_RECVMSG, + CGROUP_GETSOCKOPT, + CGROUP_SETSOCKOPT, + CGROUP_INET4_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, + CGROUP_INET4_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, + CGROUP_INET_SOCK_RELEASE, + MAX_CGROUP_BPF_ATTACH_TYPE +}; + +struct cgroup_bpf { + /* array of effective progs in this cgroup */ + struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE]; + + /* attached progs to this cgroup and attach flags + * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will + * have either zero or one element + * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS + */ + struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; + u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; + + /* list of cgroup shared storages */ + struct list_head storages; + + /* temp storage for effective prog array used by prog_attach/detach */ + struct bpf_prog_array *inactive; + + /* reference counter used to detach bpf programs after cgroup removal */ + struct percpu_ref refcnt; + + /* cgroup_bpf is released using a work queue */ + struct work_struct release_work; +}; + +#else /* CONFIG_CGROUP_BPF */ +struct cgroup_bpf {}; +#endif /* CONFIG_CGROUP_BPF */ + +#endif diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 11820a430d6c..b525d8cdc25b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -3,10 +3,10 @@ #define _BPF_CGROUP_H #include +#include #include #include #include -#include #include #include @@ -23,33 +23,6 @@ struct ctl_table_header; struct task_struct; #ifdef CONFIG_CGROUP_BPF -enum cgroup_bpf_attach_type { - CGROUP_BPF_ATTACH_TYPE_INVALID = -1, - CGROUP_INET_INGRESS = 0, - CGROUP_INET_EGRESS, - CGROUP_INET_SOCK_CREATE, - CGROUP_SOCK_OPS, - CGROUP_DEVICE, - CGROUP_INET4_BIND, - CGROUP_INET6_BIND, - CGROUP_INET4_CONNECT, - CGROUP_INET6_CONNECT, - CGROUP_INET4_POST_BIND, - CGROUP_INET6_POST_BIND, - CGROUP_UDP4_SENDMSG, - CGROUP_UDP6_SENDMSG, - CGROUP_SYSCTL, - CGROUP_UDP4_RECVMSG, - CGROUP_UDP6_RECVMSG, - CGROUP_GETSOCKOPT, - CGROUP_SETSOCKOPT, - CGROUP_INET4_GETPEERNAME, - CGROUP_INET6_GETPEERNAME, - CGROUP_INET4_GETSOCKNAME, - CGROUP_INET6_GETSOCKNAME, - CGROUP_INET_SOCK_RELEASE, - MAX_CGROUP_BPF_ATTACH_TYPE -}; #define CGROUP_ATYPE(type) \ case BPF_##type: return type @@ -127,33 +100,6 @@ struct bpf_prog_list { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; }; -struct bpf_prog_array; - -struct cgroup_bpf { - /* array of effective progs in this cgroup */ - struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE]; - - /* attached progs to this cgroup and attach flags - * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will - * have either zero or one element - * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS - */ - struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; - u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; - - /* list of cgroup shared storages */ - struct list_head storages; - - /* temp storage for effective prog array used by prog_attach/detach */ - struct bpf_prog_array *inactive; - - /* reference counter used to detach bpf programs after cgroup removal */ - struct percpu_ref refcnt; - - /* cgroup_bpf is released using a work queue */ - struct work_struct release_work; -}; - int cgroup_bpf_inherit(struct cgroup *cgrp); void cgroup_bpf_offline(struct cgroup *cgrp); @@ -451,7 +397,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else -struct cgroup_bpf {}; static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index db2e147e069f..411684c80cf3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_CGROUPS -- cgit v1.2.3 From 0f55f9ed21f96630c6ec96805d42f92c0b458b37 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Thu, 16 Dec 2021 13:33:56 -0800 Subject: bpf: Only print scratched registers and stack slots to verifier logs. When printing verifier state for any log level, print full verifier state only on function calls or on errors. Otherwise, only print the registers and stack slots that were accessed. Log size differences: verif_scale_loop6 before: 234566564 verif_scale_loop6 after: 72143943 69% size reduction kfree_skb before: 166406 kfree_skb after: 55386 69% size reduction Before: 156: (61) r0 = *(u32 *)(r1 +0) 157: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=ctx(id=0,off=0,imm=0) R2_w=invP0 R10=fp0 fp-8_w=00000000 fp-16_w=00\ 000000 fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 fp-56_w=00000000 fp-64_w=00000000 fp-72_w=00000000 fp-80_w=00000\ 000 fp-88_w=00000000 fp-96_w=00000000 fp-104_w=00000000 fp-112_w=00000000 fp-120_w=00000000 fp-128_w=00000000 fp-136_w=00000000 fp-144_w=00\ 000000 fp-152_w=00000000 fp-160_w=00000000 fp-168_w=00000000 fp-176_w=00000000 fp-184_w=00000000 fp-192_w=00000000 fp-200_w=00000000 fp-208\ _w=00000000 fp-216_w=00000000 fp-224_w=00000000 fp-232_w=00000000 fp-240_w=00000000 fp-248_w=00000000 fp-256_w=00000000 fp-264_w=00000000 f\ p-272_w=00000000 fp-280_w=00000000 fp-288_w=00000000 fp-296_w=00000000 fp-304_w=00000000 fp-312_w=00000000 fp-320_w=00000000 fp-328_w=00000\ 000 fp-336_w=00000000 fp-344_w=00000000 fp-352_w=00000000 fp-360_w=00000000 fp-368_w=00000000 fp-376_w=00000000 fp-384_w=00000000 fp-392_w=\ 00000000 fp-400_w=00000000 fp-408_w=00000000 fp-416_w=00000000 fp-424_w=00000000 fp-432_w=00000000 fp-440_w=00000000 fp-448_w=00000000 ; return skb->len; 157: (95) exit Func#4 is safe for any args that match its prototype Validating get_constant() func#5... 158: R1=invP(id=0) R10=fp0 ; int get_constant(long val) 158: (bf) r0 = r1 159: R0_w=invP(id=1) R1=invP(id=1) R10=fp0 ; return val - 122; 159: (04) w0 += -122 160: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=invP(id=1) R10=fp0 ; return val - 122; 160: (95) exit Func#5 is safe for any args that match its prototype Validating get_skb_ifindex() func#6... 161: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0 ; int get_skb_ifindex(int val, struct __sk_buff *skb, int var) 161: (bc) w0 = w3 162: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0 After: 156: (61) r0 = *(u32 *)(r1 +0) 157: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=ctx(id=0,off=0,imm=0) ; return skb->len; 157: (95) exit Func#4 is safe for any args that match its prototype Validating get_constant() func#5... 158: R1=invP(id=0) R10=fp0 ; int get_constant(long val) 158: (bf) r0 = r1 159: R0_w=invP(id=1) R1=invP(id=1) ; return val - 122; 159: (04) w0 += -122 160: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return val - 122; 160: (95) exit Func#5 is safe for any args that match its prototype Validating get_skb_ifindex() func#6... 161: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0 ; int get_skb_ifindex(int val, struct __sk_buff *skb, int var) 161: (bc) w0 = w3 162: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R3=invP(id=0) Signed-off-by: Christy Lee Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211216213358.3374427-2-christylee@fb.com --- include/linux/bpf_verifier.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 182b16a91084..c66f238c538d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -474,6 +474,13 @@ struct bpf_verifier_env { /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; bpfptr_t fd_array; + + /* bit mask to keep track of whether a register has been accessed + * since the last time the function state was printed + */ + u32 scratched_regs; + /* Same as scratched_regs but for stack slots */ + u64 scratched_stack_slots; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, -- cgit v1.2.3 From 2e5766483c8c5cf886b4dc647a1741738dde7d79 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Thu, 16 Dec 2021 19:42:45 -0800 Subject: bpf: Right align verifier states in verifier logs. Make the verifier logs more readable, print the verifier states on the corresponding instruction line. If the previous line was not a bpf instruction, then print the verifier states on its own line. Before: Validating test_pkt_access_subprog3() func#3... 86: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R10=fp0 ; int test_pkt_access_subprog3(int val, struct __sk_buff *skb) 86: (bf) r6 = r2 87: R2=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 87: (bc) w7 = w1 88: R1=invP(id=0) R7_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 88: (bf) r1 = r6 89: R1_w=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 89: (85) call pc+9 Func#4 is global and valid. Skipping. 90: R0_w=invP(id=0) 90: (bc) w8 = w0 91: R0_w=invP(id=0) R8_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 91: (b7) r1 = 123 92: R1_w=invP123 92: (85) call pc+65 Func#5 is global and valid. Skipping. 93: R0=invP(id=0) After: 86: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R10=fp0 ; int test_pkt_access_subprog3(int val, struct __sk_buff *skb) 86: (bf) r6 = r2 ; R2=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 87: (bc) w7 = w1 ; R1=invP(id=0) R7_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 88: (bf) r1 = r6 ; R1_w=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 89: (85) call pc+9 Func#4 is global and valid. Skipping. 90: R0_w=invP(id=0) 90: (bc) w8 = w0 ; R0_w=invP(id=0) R8_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 91: (b7) r1 = 123 ; R1_w=invP123 92: (85) call pc+65 Func#5 is global and valid. Skipping. 93: R0=invP(id=0) Signed-off-by: Christy Lee Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c66f238c538d..ee931398f311 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -388,6 +388,8 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) #define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) #define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ +#define BPF_LOG_MIN_ALIGNMENT 8U +#define BPF_LOG_ALIGNMENT 40U static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { @@ -481,6 +483,7 @@ struct bpf_verifier_env { u32 scratched_regs; /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; + u32 prev_log_len, prev_insn_print_len; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, -- cgit v1.2.3 From bd0b536dc2e1e9828a85b1e3470ee7bafc3b36f6 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Mon, 29 Nov 2021 16:15:59 -0800 Subject: virtchnl: Add support for new VLAN capabilities Currently VIRTCHNL only allows for VLAN filtering and offloads to happen on a single 802.1Q VLAN. Add support to filter and offload on inner, outer, and/or inner + outer VLANs. This is done by introducing the new capability VIRTCHNL_VF_OFFLOAD_VLAN_V2. The flow to negotiate this new capability is shown below. 1. VF - sets the VIRTCHNL_VF_OFFLOAD_VLAN_V2 bit in the virtchnl_vf_resource.vf_caps_flags during the VIRTCHNL_OP_GET_VF_RESOURCES request message. The VF should also set the VIRTCHNL_VF_OFFLOAD_VLAN bit in case the PF driver doesn't support the new capability. 2. PF - sets the VLAN capability bit it supports in the VIRTCHNL_OP_GET_VF_RESOURCES response message. This will either be VIRTCHNL_VF_OFFLOAD_VLAN_V2, VIRTCHNL_VF_OFFLOAD_VLAN, or none. 3. VF - If the VIRTCHNL_VF_OFFLOAD_VLAN_V2 capability was ACK'd by the PF, then the VF needs to request the VLAN capabilities of the PF/Device by issuing a VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS request. If the VIRTCHNL_VF_OFFLOAD_VLAN capability was ACK'd then the VF knows only single 802.1Q VLAN filtering/offloads are supported. If no VLAN capability is ACK'd then the PF/Device doesn't support hardware VLAN filtering/offloads for this VF. 4. PF - Populates the virtchnl_vlan_caps structure based on what it allows/supports for that VF and sends that response via VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS. After VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS is successfully negotiated the VF driver needs to interpret the capabilities supported by the underlying PF/Device. The VF will be allowed to filter/offload the inner 802.1Q, outer (various ethertype), inner 802.1Q + outer (various ethertypes), or none based on which fields are set. The VF will also need to interpret where the VLAN tag should be inserted and/or stripped based on the negotiated capabilities. Signed-off-by: Brett Creeley Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 377 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 377 insertions(+) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index b30a1bc74fc7..2ce27e8e4f19 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -141,6 +141,13 @@ enum virtchnl_ops { VIRTCHNL_OP_DEL_RSS_CFG = 46, VIRTCHNL_OP_ADD_FDIR_FILTER = 47, VIRTCHNL_OP_DEL_FDIR_FILTER = 48, + VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS = 51, + VIRTCHNL_OP_ADD_VLAN_V2 = 52, + VIRTCHNL_OP_DEL_VLAN_V2 = 53, + VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 = 54, + VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 = 55, + VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 = 56, + VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 = 57, VIRTCHNL_OP_MAX, }; @@ -246,6 +253,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES BIT(6) /* used to negotiate communicating link speeds in Mbps */ #define VIRTCHNL_VF_CAP_ADV_LINK_SPEED BIT(7) +#define VIRTCHNL_VF_OFFLOAD_VLAN_V2 BIT(15) #define VIRTCHNL_VF_OFFLOAD_VLAN BIT(16) #define VIRTCHNL_VF_OFFLOAD_RX_POLLING BIT(17) #define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 BIT(18) @@ -475,6 +483,351 @@ struct virtchnl_vlan_filter_list { VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_vlan_filter_list); +/* This enum is used for all of the VIRTCHNL_VF_OFFLOAD_VLAN_V2_CAPS related + * structures and opcodes. + * + * VIRTCHNL_VLAN_UNSUPPORTED - This field is not supported and if a VF driver + * populates it the PF should return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED. + * + * VIRTCHNL_VLAN_ETHERTYPE_8100 - This field supports 0x8100 ethertype. + * VIRTCHNL_VLAN_ETHERTYPE_88A8 - This field supports 0x88A8 ethertype. + * VIRTCHNL_VLAN_ETHERTYPE_9100 - This field supports 0x9100 ethertype. + * + * VIRTCHNL_VLAN_ETHERTYPE_AND - Used when multiple ethertypes can be supported + * by the PF concurrently. For example, if the PF can support + * VIRTCHNL_VLAN_ETHERTYPE_8100 AND VIRTCHNL_VLAN_ETHERTYPE_88A8 filters it + * would OR the following bits: + * + * VIRTHCNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * The VF would interpret this as VLAN filtering can be supported on both 0x8100 + * and 0x88A8 VLAN ethertypes. + * + * VIRTCHNL_ETHERTYPE_XOR - Used when only a single ethertype can be supported + * by the PF concurrently. For example if the PF can support + * VIRTCHNL_VLAN_ETHERTYPE_8100 XOR VIRTCHNL_VLAN_ETHERTYPE_88A8 stripping + * offload it would OR the following bits: + * + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * The VF would interpret this as VLAN stripping can be supported on either + * 0x8100 or 0x88a8 VLAN ethertypes. So when requesting VLAN stripping via + * VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 the specified ethertype will override + * the previously set value. + * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1 - Used to tell the VF to insert and/or + * strip the VLAN tag using the L2TAG1 field of the Tx/Rx descriptors. + * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 - Used to tell the VF to insert hardware + * offloaded VLAN tags using the L2TAG2 field of the Tx descriptor. + * + * VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 - Used to tell the VF to strip hardware + * offloaded VLAN tags using the L2TAG2_2 field of the Rx descriptor. + * + * VIRTCHNL_VLAN_PRIO - This field supports VLAN priority bits. This is used for + * VLAN filtering if the underlying PF supports it. + * + * VIRTCHNL_VLAN_TOGGLE_ALLOWED - This field is used to say whether a + * certain VLAN capability can be toggled. For example if the underlying PF/CP + * allows the VF to toggle VLAN filtering, stripping, and/or insertion it should + * set this bit along with the supported ethertypes. + */ +enum virtchnl_vlan_support { + VIRTCHNL_VLAN_UNSUPPORTED = 0, + VIRTCHNL_VLAN_ETHERTYPE_8100 = BIT(0), + VIRTCHNL_VLAN_ETHERTYPE_88A8 = BIT(1), + VIRTCHNL_VLAN_ETHERTYPE_9100 = BIT(2), + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1 = BIT(8), + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2 = BIT(9), + VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2 = BIT(10), + VIRTCHNL_VLAN_PRIO = BIT(24), + VIRTCHNL_VLAN_FILTER_MASK = BIT(28), + VIRTCHNL_VLAN_ETHERTYPE_AND = BIT(29), + VIRTCHNL_VLAN_ETHERTYPE_XOR = BIT(30), + VIRTCHNL_VLAN_TOGGLE = BIT(31), +}; + +/* This structure is used as part of the VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS + * for filtering, insertion, and stripping capabilities. + * + * If only outer capabilities are supported (for filtering, insertion, and/or + * stripping) then this refers to the outer most or single VLAN from the VF's + * perspective. + * + * If only inner capabilities are supported (for filtering, insertion, and/or + * stripping) then this refers to the outer most or single VLAN from the VF's + * perspective. Functionally this is the same as if only outer capabilities are + * supported. The VF driver is just forced to use the inner fields when + * adding/deleting filters and enabling/disabling offloads (if supported). + * + * If both outer and inner capabilities are supported (for filtering, insertion, + * and/or stripping) then outer refers to the outer most or single VLAN and + * inner refers to the second VLAN, if it exists, in the packet. + * + * There is no support for tunneled VLAN offloads, so outer or inner are never + * referring to a tunneled packet from the VF's perspective. + */ +struct virtchnl_vlan_supported_caps { + u32 outer; + u32 inner; +}; + +/* The PF populates these fields based on the supported VLAN filtering. If a + * field is VIRTCHNL_VLAN_UNSUPPORTED then it's not supported and the PF will + * reject any VIRTCHNL_OP_ADD_VLAN_V2 or VIRTCHNL_OP_DEL_VLAN_V2 messages using + * the unsupported fields. + * + * Also, a VF is only allowed to toggle its VLAN filtering setting if the + * VIRTCHNL_VLAN_TOGGLE bit is set. + * + * The ethertype(s) specified in the ethertype_init field are the ethertypes + * enabled for VLAN filtering. VLAN filtering in this case refers to the outer + * most VLAN from the VF's perspective. If both inner and outer filtering are + * allowed then ethertype_init only refers to the outer most VLAN as only + * VLAN ethertype supported for inner VLAN filtering is + * VIRTCHNL_VLAN_ETHERTYPE_8100. By default, inner VLAN filtering is disabled + * when both inner and outer filtering are allowed. + * + * The max_filters field tells the VF how many VLAN filters it's allowed to have + * at any one time. If it exceeds this amount and tries to add another filter, + * then the request will be rejected by the PF. To prevent failures, the VF + * should keep track of how many VLAN filters it has added and not attempt to + * add more than max_filters. + */ +struct virtchnl_vlan_filtering_caps { + struct virtchnl_vlan_supported_caps filtering_support; + u32 ethertype_init; + u16 max_filters; + u8 pad[2]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vlan_filtering_caps); + +/* This enum is used for the virtchnl_vlan_offload_caps structure to specify + * if the PF supports a different ethertype for stripping and insertion. + * + * VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION - The ethertype(s) specified + * for stripping affect the ethertype(s) specified for insertion and visa versa + * as well. If the VF tries to configure VLAN stripping via + * VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 with VIRTCHNL_VLAN_ETHERTYPE_8100 then + * that will be the ethertype for both stripping and insertion. + * + * VIRTCHNL_ETHERTYPE_MATCH_NOT_REQUIRED - The ethertype(s) specified for + * stripping do not affect the ethertype(s) specified for insertion and visa + * versa. + */ +enum virtchnl_vlan_ethertype_match { + VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION = 0, + VIRTCHNL_ETHERTYPE_MATCH_NOT_REQUIRED = 1, +}; + +/* The PF populates these fields based on the supported VLAN offloads. If a + * field is VIRTCHNL_VLAN_UNSUPPORTED then it's not supported and the PF will + * reject any VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 or + * VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 messages using the unsupported fields. + * + * Also, a VF is only allowed to toggle its VLAN offload setting if the + * VIRTCHNL_VLAN_TOGGLE_ALLOWED bit is set. + * + * The VF driver needs to be aware of how the tags are stripped by hardware and + * inserted by the VF driver based on the level of offload support. The PF will + * populate these fields based on where the VLAN tags are expected to be + * offloaded via the VIRTHCNL_VLAN_TAG_LOCATION_* bits. The VF will need to + * interpret these fields. See the definition of the + * VIRTCHNL_VLAN_TAG_LOCATION_* bits above the virtchnl_vlan_support + * enumeration. + */ +struct virtchnl_vlan_offload_caps { + struct virtchnl_vlan_supported_caps stripping_support; + struct virtchnl_vlan_supported_caps insertion_support; + u32 ethertype_init; + u8 ethertype_match; + u8 pad[3]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_vlan_offload_caps); + +/* VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS + * VF sends this message to determine its VLAN capabilities. + * + * PF will mark which capabilities it supports based on hardware support and + * current configuration. For example, if a port VLAN is configured the PF will + * not allow outer VLAN filtering, stripping, or insertion to be configured so + * it will block these features from the VF. + * + * The VF will need to cross reference its capabilities with the PFs + * capabilities in the response message from the PF to determine the VLAN + * support. + */ +struct virtchnl_vlan_caps { + struct virtchnl_vlan_filtering_caps filtering; + struct virtchnl_vlan_offload_caps offloads; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_vlan_caps); + +struct virtchnl_vlan { + u16 tci; /* tci[15:13] = PCP and tci[11:0] = VID */ + u16 tci_mask; /* only valid if VIRTCHNL_VLAN_FILTER_MASK set in + * filtering caps + */ + u16 tpid; /* 0x8100, 0x88a8, etc. and only type(s) set in + * filtering caps. Note that tpid here does not refer to + * VIRTCHNL_VLAN_ETHERTYPE_*, but it refers to the + * actual 2-byte VLAN TPID + */ + u8 pad[2]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_vlan); + +struct virtchnl_vlan_filter { + struct virtchnl_vlan inner; + struct virtchnl_vlan outer; + u8 pad[16]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(32, virtchnl_vlan_filter); + +/* VIRTCHNL_OP_ADD_VLAN_V2 + * VIRTCHNL_OP_DEL_VLAN_V2 + * + * VF sends these messages to add/del one or more VLAN tag filters for Rx + * traffic. + * + * The PF attempts to add the filters and returns status. + * + * The VF should only ever attempt to add/del virtchnl_vlan_filter(s) using the + * supported fields negotiated via VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS. + */ +struct virtchnl_vlan_filter_list_v2 { + u16 vport_id; + u16 num_elements; + u8 pad[4]; + struct virtchnl_vlan_filter filters[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_vlan_filter_list_v2); + +/* VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 + * VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 + * VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 + * VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 + * + * VF sends this message to enable or disable VLAN stripping or insertion. It + * also needs to specify an ethertype. The VF knows which VLAN ethertypes are + * allowed and whether or not it's allowed to enable/disable the specific + * offload via the VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS message. The VF needs to + * parse the virtchnl_vlan_caps.offloads fields to determine which offload + * messages are allowed. + * + * For example, if the PF populates the virtchnl_vlan_caps.offloads in the + * following manner the VF will be allowed to enable and/or disable 0x8100 inner + * VLAN insertion and/or stripping via the opcodes listed above. Inner in this + * case means the outer most or single VLAN from the VF's perspective. This is + * because no outer offloads are supported. See the comments above the + * virtchnl_vlan_supported_caps structure for more details. + * + * virtchnl_vlan_caps.offloads.stripping_support.inner = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * virtchnl_vlan_caps.offloads.insertion_support.inner = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * In order to enable inner (again note that in this case inner is the outer + * most or single VLAN from the VF's perspective) VLAN stripping for 0x8100 + * VLANs, the VF would populate the virtchnl_vlan_setting structure in the + * following manner and send the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 message. + * + * virtchnl_vlan_setting.inner_ethertype_setting = + * VIRTCHNL_VLAN_ETHERTYPE_8100; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. + * + * The reason that VLAN TPID(s) are not being used for the + * outer_ethertype_setting and inner_ethertype_setting fields is because it's + * possible a device could support VLAN insertion and/or stripping offload on + * multiple ethertypes concurrently, so this method allows a VF to request + * multiple ethertypes in one message using the virtchnl_vlan_support + * enumeration. + * + * For example, if the PF populates the virtchnl_vlan_caps.offloads in the + * following manner the VF will be allowed to enable 0x8100 and 0x88a8 outer + * VLAN insertion and stripping simultaneously. The + * virtchnl_vlan_caps.offloads.ethertype_match field will also have to be + * populated based on what the PF can support. + * + * virtchnl_vlan_caps.offloads.stripping_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * virtchnl_vlan_caps.offloads.insertion_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_AND; + * + * In order to enable outer VLAN stripping for 0x8100 and 0x88a8 VLANs, the VF + * would populate the virthcnl_vlan_offload_structure in the following manner + * and send the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 message. + * + * virtchnl_vlan_setting.outer_ethertype_setting = + * VIRTHCNL_VLAN_ETHERTYPE_8100 | + * VIRTHCNL_VLAN_ETHERTYPE_88A8; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. + * + * There is also the case where a PF and the underlying hardware can support + * VLAN offloads on multiple ethertypes, but not concurrently. For example, if + * the PF populates the virtchnl_vlan_caps.offloads in the following manner the + * VF will be allowed to enable and/or disable 0x8100 XOR 0x88a8 outer VLAN + * offloads. The ethertypes must match for stripping and insertion. + * + * virtchnl_vlan_caps.offloads.stripping_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * virtchnl_vlan_caps.offloads.insertion_support.outer = + * VIRTCHNL_VLAN_TOGGLE | + * VIRTCHNL_VLAN_ETHERTYPE_8100 | + * VIRTCHNL_VLAN_ETHERTYPE_88A8 | + * VIRTCHNL_VLAN_ETHERTYPE_XOR; + * + * virtchnl_vlan_caps.offloads.ethertype_match = + * VIRTCHNL_ETHERTYPE_STRIPPING_MATCHES_INSERTION; + * + * In order to enable outer VLAN stripping for 0x88a8 VLANs, the VF would + * populate the virtchnl_vlan_setting structure in the following manner and send + * the VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2. Also, this will change the + * ethertype for VLAN insertion if it's enabled. So, for completeness, a + * VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 with the same ethertype should be sent. + * + * virtchnl_vlan_setting.outer_ethertype_setting = VIRTHCNL_VLAN_ETHERTYPE_88A8; + * + * virtchnl_vlan_setting.vport_id = vport_id or vsi_id assigned to the VF on + * initialization. + */ +struct virtchnl_vlan_setting { + u32 outer_ethertype_setting; + u32 inner_ethertype_setting; + u16 vport_id; + u8 pad[6]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vlan_setting); + /* VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE * VF sends VSI id and flags. * PF returns status code in retval. @@ -1156,6 +1509,30 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_DEL_FDIR_FILTER: valid_len = sizeof(struct virtchnl_fdir_del); break; + case VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS: + break; + case VIRTCHNL_OP_ADD_VLAN_V2: + case VIRTCHNL_OP_DEL_VLAN_V2: + valid_len = sizeof(struct virtchnl_vlan_filter_list_v2); + if (msglen >= valid_len) { + struct virtchnl_vlan_filter_list_v2 *vfl = + (struct virtchnl_vlan_filter_list_v2 *)msg; + + valid_len += (vfl->num_elements - 1) * + sizeof(struct virtchnl_vlan_filter); + + if (vfl->num_elements == 0) { + err_msg_format = true; + break; + } + } + break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2: + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2: + case VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2: + case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2: + valid_len = sizeof(struct virtchnl_vlan_setting); + break; /* These are always errors coming from the VF. */ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: -- cgit v1.2.3 From d639b9d13a39cf15639cbe6e8b2c43eb60148a73 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:44 -0800 Subject: bpf: Introduce composable reg, ret and arg types. There are some common properties shared between bpf reg, ret and arg values. For instance, a value may be a NULL pointer, or a pointer to a read-only memory. Previously, to express these properties, enumeration was used. For example, in order to test whether a reg value can be NULL, reg_type_may_be_null() simply enumerates all types that are possibly NULL. The problem of this approach is that it's not scalable and causes a lot of duplication. These properties can be combined, for example, a type could be either MAYBE_NULL or RDONLY, or both. This patch series rewrites the layout of reg_type, arg_type and ret_type, so that common properties can be extracted and represented as composable flag. For example, one can write ARG_PTR_TO_MEM | PTR_MAYBE_NULL which is equivalent to the previous ARG_PTR_TO_MEM_OR_NULL The type ARG_PTR_TO_MEM are called "base type" in this patch. Base types can be extended with flags. A flag occupies the higher bits while base types sits in the lower bits. This patch in particular sets up a set of macro for this purpose. The following patches will rewrite arg_types, ret_types and reg_types respectively. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-2-haoluo@google.com --- include/linux/bpf.h | 42 ++++++++++++++++++++++++++++++++++++++++++ include/linux/bpf_verifier.h | 13 +++++++++++++ 2 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 965fffaf0308..41bb3687cc85 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -297,6 +297,29 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, extern const struct bpf_map_ops bpf_map_offload_ops; +/* bpf_type_flag contains a set of flags that are applicable to the values of + * arg_type, ret_type and reg_type. For example, a pointer value may be null, + * or a memory is read-only. We classify types into two categories: base types + * and extended types. Extended types are base types combined with a type flag. + * + * Currently there are no more than 32 base types in arg_type, ret_type and + * reg_types. + */ +#define BPF_BASE_TYPE_BITS 8 + +enum bpf_type_flag { + /* PTR may be NULL. */ + PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = PTR_MAYBE_NULL, +}; + +/* Max number of base types. */ +#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) + +/* Max number of all types. */ +#define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) + /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -343,7 +366,13 @@ enum bpf_arg_type { ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ __BPF_ARG_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* type of values returned from helper functions */ enum bpf_return_type { @@ -359,7 +388,14 @@ enum bpf_return_type { RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ + __BPF_RET_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL @@ -461,7 +497,13 @@ enum bpf_reg_type { PTR_TO_FUNC, /* reg points to a bpf program function */ PTR_TO_MAP_KEY, /* reg points to a map element key */ __BPF_REG_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* The information passed from prog-specific *_is_valid_access * back to the verifier. diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ee931398f311..34e4ceaca3c7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -546,5 +546,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); +#define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) + +/* extract base type from bpf_{arg, return, reg}_type. */ +static inline u32 base_type(u32 type) +{ + return type & BPF_BASE_TYPE_MASK; +} + +/* extract flags from an extended type. See bpf_type_flag in bpf.h. */ +static inline u32 type_flag(u32 type) +{ + return type & ~BPF_BASE_TYPE_MASK; +} #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From 48946bd6a5d695c50b34546864b79c1f910a33c1 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:45 -0800 Subject: bpf: Replace ARG_XXX_OR_NULL with ARG_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_arg composable, by reserving high bits of bpf_arg to represent flags of a type. One of the flags is PTR_MAYBE_NULL which indicates a pointer may be NULL. When applying this flag to an arg_type, it means the arg can take NULL pointer. This patch switches the qualified arg_types to use this flag. The arg_types changed in this patch include: 1. ARG_PTR_TO_MAP_VALUE_OR_NULL 2. ARG_PTR_TO_MEM_OR_NULL 3. ARG_PTR_TO_CTX_OR_NULL 4. ARG_PTR_TO_SOCKET_OR_NULL 5. ARG_PTR_TO_ALLOC_MEM_OR_NULL 6. ARG_PTR_TO_STACK_OR_NULL This patch does not eliminate the use of these arg_types, instead it makes them an alias to the 'ARG_XXX | PTR_MAYBE_NULL'. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-3-haoluo@google.com --- include/linux/bpf.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 41bb3687cc85..765bd7cc4272 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -331,13 +331,11 @@ enum bpf_arg_type { ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ - ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ - ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, * helper function must fill all bytes or clear * them in error case. @@ -347,26 +345,31 @@ enum bpf_arg_type { ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ - ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ - ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ - ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ - ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ + ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ __BPF_ARG_TYPE_MAX, + /* Extended arg_types. */ + ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, + ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, + ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, + ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, + ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, + ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ -- cgit v1.2.3 From 3c4807322660d4290ac9062c034aed6b87243861 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:46 -0800 Subject: bpf: Replace RET_XXX_OR_NULL with RET_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_ret composable, by reserving high bits to represent flags. One of the flag is PTR_MAYBE_NULL, which indicates a pointer may be NULL. When applying this flag to ret_types, it means the returned value could be a NULL pointer. This patch switches the qualified arg_types to use this flag. The ret_types changed in this patch include: 1. RET_PTR_TO_MAP_VALUE_OR_NULL 2. RET_PTR_TO_SOCKET_OR_NULL 3. RET_PTR_TO_TCP_SOCK_OR_NULL 4. RET_PTR_TO_SOCK_COMMON_OR_NULL 5. RET_PTR_TO_ALLOC_MEM_OR_NULL 6. RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL 7. RET_PTR_TO_BTF_ID_OR_NULL This patch doesn't eliminate the use of these names, instead it makes them aliases to 'RET_PTR_TO_XXX | PTR_MAYBE_NULL'. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-4-haoluo@google.com --- include/linux/bpf.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 765bd7cc4272..975a1d5951bd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -382,17 +382,22 @@ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ - RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ - RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ - RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ - RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ - RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ - RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ - RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ + RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ + RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ + RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, + /* Extended ret_types. */ + RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, + RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, + RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, + RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ -- cgit v1.2.3 From c25b2ae136039ffa820c26138ed4a5e5f3ab3841 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:47 -0800 Subject: bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_reg composable, by allocating bits in the type to represent flags. One of the flags is PTR_MAYBE_NULL which indicates a pointer may be NULL. This patch switches the qualified reg_types to use this flag. The reg_types changed in this patch include: 1. PTR_TO_MAP_VALUE_OR_NULL 2. PTR_TO_SOCKET_OR_NULL 3. PTR_TO_SOCK_COMMON_OR_NULL 4. PTR_TO_TCP_SOCK_OR_NULL 5. PTR_TO_BTF_ID_OR_NULL 6. PTR_TO_MEM_OR_NULL 7. PTR_TO_RDONLY_BUF_OR_NULL 8. PTR_TO_RDWR_BUF_OR_NULL Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211217003152.48334-5-haoluo@google.com --- include/linux/bpf.h | 18 +++++++++--------- include/linux/bpf_verifier.h | 4 ++++ 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 975a1d5951bd..c3de62267b84 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -465,18 +465,15 @@ enum bpf_reg_type { PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ - PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + PTR_TO_MAP_KEY, /* reg points to a map element key */ PTR_TO_STACK, /* reg == frame_pointer + offset */ PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ - PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ - PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ - PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need @@ -494,18 +491,21 @@ enum bpf_reg_type { * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct. */ - PTR_TO_BTF_ID_OR_NULL, PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ - PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ PTR_TO_FUNC, /* reg points to a bpf program function */ - PTR_TO_MAP_KEY, /* reg points to a map element key */ __BPF_REG_TYPE_MAX, + /* Extended reg_types. */ + PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, + PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, + PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, + PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, + PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, + PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 34e4ceaca3c7..143401d4c9d9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -18,6 +18,8 @@ * that converting umax_value to int cannot overflow. */ #define BPF_MAX_VAR_SIZ (1 << 29) +/* size of type_str_buf in bpf_verifier. */ +#define TYPE_STR_BUF_LEN 64 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that @@ -484,6 +486,8 @@ struct bpf_verifier_env { /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; u32 prev_log_len, prev_insn_print_len; + /* buffer used in reg_type_str() to generate reg_type string */ + char type_str_buf[TYPE_STR_BUF_LEN]; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, -- cgit v1.2.3 From 20b2aff4bc15bda809f994761d5719827d66c0b4 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:48 -0800 Subject: bpf: Introduce MEM_RDONLY flag This patch introduce a flag MEM_RDONLY to tag a reg value pointing to read-only memory. It makes the following changes: 1. PTR_TO_RDWR_BUF -> PTR_TO_BUF 2. PTR_TO_RDONLY_BUF -> PTR_TO_BUF | MEM_RDONLY Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-6-haoluo@google.com --- include/linux/bpf.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c3de62267b84..126048110bdb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -311,7 +311,10 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = PTR_MAYBE_NULL, + /* MEM is read-only. */ + MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_RDONLY, }; /* Max number of base types. */ @@ -492,8 +495,7 @@ enum bpf_reg_type { * an explicit null check is required for this struct. */ PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ + PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ PTR_TO_FUNC, /* reg points to a bpf program function */ __BPF_REG_TYPE_MAX, -- cgit v1.2.3 From cf9f2f8d62eca810afbd1ee6cc0800202b000e57 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:49 -0800 Subject: bpf: Convert PTR_TO_MEM_OR_NULL to composable types. Remove PTR_TO_MEM_OR_NULL and replace it with PTR_TO_MEM combined with flag PTR_MAYBE_NULL. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-7-haoluo@google.com --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 126048110bdb..567d83bf28f9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -506,7 +506,6 @@ enum bpf_reg_type { PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, - PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. -- cgit v1.2.3 From 216e3cd2f28dbbf1fe86848e0e29e6693b9f0a20 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:51 -0800 Subject: bpf: Add MEM_RDONLY for helper args that are pointers to rdonly mem. Some helper functions may modify its arguments, for example, bpf_d_path, bpf_get_stack etc. Previously, their argument types were marked as ARG_PTR_TO_MEM, which is compatible with read-only mem types, such as PTR_TO_RDONLY_BUF. Therefore it's legitimate, but technically incorrect, to modify a read-only memory by passing it into one of such helper functions. This patch tags the bpf_args compatible with immutable memory with MEM_RDONLY flag. The arguments that don't have this flag will be only compatible with mutable memory types, preventing the helper from modifying a read-only memory. The bpf_args that have MEM_RDONLY are compatible with both mutable memory and immutable memory. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-9-haoluo@google.com --- include/linux/bpf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 567d83bf28f9..26753139d5b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -311,7 +311,9 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), - /* MEM is read-only. */ + /* MEM is read-only. When applied on bpf_arg, it indicates the arg is + * compatible with both mutable and immutable memory. + */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), __BPF_TYPE_LAST_FLAG = MEM_RDONLY, -- cgit v1.2.3 From 8cbfe939abe905280279e84a297b1cb34e0d0ec9 Mon Sep 17 00:00:00 2001 From: Baowen Zheng Date: Fri, 17 Dec 2021 19:16:22 +0100 Subject: flow_offload: allow user to offload tc action to net device Use flow_indr_dev_register/flow_indr_dev_setup_offload to offload tc action. We need to call tc_cleanup_flow_action to clean up tc action entry since in tc_setup_action, some actions may hold dev refcnt, especially the mirror action. Signed-off-by: Baowen Zheng Signed-off-by: Louis Peens Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a419718612c6..8b0bdeb4734e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -920,6 +920,7 @@ enum tc_setup_type { TC_SETUP_QDISC_TBF, TC_SETUP_QDISC_FIFO, TC_SETUP_QDISC_HTB, + TC_SETUP_ACT, }; /* These structures hold the attributes of bpf state that are being passed -- cgit v1.2.3 From 5bc9a9dd75351023793d8aa4116ead005d659729 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 19 Dec 2021 21:51:24 +0200 Subject: rfkill: allow to get the software rfkill state iwlwifi needs to be able to differentiate between the software rfkill state and the hardware rfkill state. The reason for this is that iwlwifi needs to notify any change in the software rfkill state even when it doesn't own the device (which means even when the hardware rfkill is asserted). In order to be able to know the software rfkill when the host does not own the device, iwlwifi needs to be able to ask the state of the software rfkill ignoring the state of the hardware rfkill. Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20211219195124.125689-1-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- include/linux/rfkill.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 231e06b74b50..c35f3962dc4f 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -229,6 +229,13 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw); */ bool rfkill_blocked(struct rfkill *rfkill); +/** + * rfkill_soft_blocked - Query soft rfkill block state + * + * @rfkill: rfkill struct to query + */ +bool rfkill_soft_blocked(struct rfkill *rfkill); + /** * rfkill_find_type - Helper for finding rfkill type by name * @name: the name of the type -- cgit v1.2.3 From 30be4551f9e26292599e666985119a5b559a2e4a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 22 Dec 2021 18:32:56 +0200 Subject: wwan: Replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when there are circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. Signed-off-by: Andy Shevchenko Reviewed-by: Sergey Ryazanov Signed-off-by: David S. Miller --- include/linux/wwan.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index e143c88bf4b0..afb3334ec8c5 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -4,12 +4,9 @@ #ifndef __WWAN_H #define __WWAN_H -#include -#include #include -#include -#include #include +#include /** * enum wwan_port_type - WWAN port types @@ -37,6 +34,10 @@ enum wwan_port_type { WWAN_PORT_UNKNOWN, }; +struct device; +struct file; +struct netlink_ext_ack; +struct sk_buff; struct wwan_port; /** struct wwan_port_ops - The WWAN port operations -- cgit v1.2.3 From b6459415b384cb829f0b2a4268f211c789f6cf0b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 28 Dec 2021 16:49:13 -0800 Subject: net: Don't include filter.h from net/sock.h sock.h is pretty heavily used (5k objects rebuilt on x86 after it's touched). We can drop the include of filter.h from it and add a forward declaration of struct sk_filter instead. This decreases the number of rebuilt objects when bpf.h is touched from ~5k to ~1k. There's a lot of missing includes this was masking. Primarily in networking tho, this time. Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Acked-by: Marc Kleine-Budde Acked-by: Florian Fainelli Acked-by: Nikolay Aleksandrov Acked-by: Stefano Garzarella Link: https://lore.kernel.org/bpf/20211229004913.513372-1-kuba@kernel.org --- include/linux/bpf_local_storage.h | 1 + include/linux/dsa/loop.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 24496bc28e7b..a2b625960ffe 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -8,6 +8,7 @@ #define _BPF_LOCAL_STORAGE_H #include +#include #include #include #include diff --git a/include/linux/dsa/loop.h b/include/linux/dsa/loop.h index 5a3470bcc8a7..b8fef35591aa 100644 --- a/include/linux/dsa/loop.h +++ b/include/linux/dsa/loop.h @@ -2,6 +2,7 @@ #ifndef DSA_LOOP_H #define DSA_LOOP_H +#include #include #include #include -- cgit v1.2.3 From 0fe4b381a59ebc53522fce579b281a67a9e1bee6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 24 Dec 2021 15:29:15 +0000 Subject: bpf: Allow bpf_local_storage to be used by sleepable programs Other maps like hashmaps are already available to sleepable programs. Sleepable BPF programs run under trace RCU. Allow task, sk and inode storage to be used from sleepable programs. This allows sleepable and non-sleepable programs to provide shareable annotations on kernel objects. Sleepable programs run in trace RCU where as non-sleepable programs run in a normal RCU critical section i.e. __bpf_prog_enter{_sleepable} and __bpf_prog_exit{_sleepable}) (rcu_read_lock or rcu_read_lock_trace). In order to make the local storage maps accessible to both sleepable and non-sleepable programs, one needs to call both call_rcu_tasks_trace and call_rcu to wait for both trace and classical RCU grace periods to expire before freeing memory. Paul's work on call_rcu_tasks_trace allows us to have per CPU queueing for call_rcu_tasks_trace. This behaviour can be achieved by setting rcupdate.rcu_task_enqueue_lim= boot parameter. In light of these new performance changes and to keep the local storage code simple, avoid adding a new flag for sleepable maps / local storage to select the RCU synchronization (trace / classical). Also, update the dereferencing of the pointers to use rcu_derference_check (with either the trace or normal RCU locks held) with a common bpf_rcu_lock_held helper method. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211224152916.1550677-2-kpsingh@kernel.org --- include/linux/bpf_local_storage.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a2b625960ffe..37b3906af8b1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -17,6 +17,9 @@ #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 +#define bpf_rcu_lock_held() \ + (rcu_read_lock_held() || rcu_read_lock_trace_held() || \ + rcu_read_lock_bh_held()) struct bpf_local_storage_map_bucket { struct hlist_head list; raw_spinlock_t lock; @@ -162,4 +165,6 @@ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags); +void bpf_local_storage_free_rcu(struct rcu_head *rcu); + #endif /* _BPF_LOCAL_STORAGE_H */ -- cgit v1.2.3 From aebb51ec3db2a871d74b4afad3f9914812acf120 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Dec 2021 17:27:42 -0800 Subject: bpf: Invert the dependency between bpf-netns.h and netns/bpf.h netns/bpf.h gets included by netdevice.h (thru net_namespace.h) which in turn gets included in a lot of places. We should keep netns/bpf.h as light-weight as possible. bpf-netns.h seems to contain more implementation details than deserves to be included in a netns header. It needs to pull in uapi/bpf.h to get various enum types. Move enum netns_bpf_attach_type to netns/bpf.h and invert the dependency. This makes netns/bpf.h fit the mold of a struct definition header more clearly, and drops the number of objects rebuilt when uapi/bpf.h is touched from 7.7k to 1.1k. Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230012742.770642-3-kuba@kernel.org --- include/linux/bpf-netns.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index 722f799c1a2e..413cfa5e4b07 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -3,15 +3,9 @@ #define _BPF_NETNS_H #include +#include #include -enum netns_bpf_attach_type { - NETNS_BPF_INVALID = -1, - NETNS_BPF_FLOW_DISSECTOR = 0, - NETNS_BPF_SK_LOOKUP, - MAX_NETNS_BPF_ATTACH_TYPE -}; - static inline enum netns_bpf_attach_type to_netns_bpf_attach_type(enum bpf_attach_type attach_type) { -- cgit v1.2.3 From d6c6d0bb2cb3af91c9c1546af7cdf4770d0df443 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 29 Dec 2021 12:36:20 +0100 Subject: net: remove references to CONFIG_IRDA in network header files Commit d64c2a76123f ("staging: irda: remove the irda network stack and drivers") removes the config IRDA. Remove the remaining references to this non-existing config in the network header files. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20211229113620.19368-1-lukas.bulwahn@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/atalk.h | 2 +- include/linux/netdevice.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atalk.h b/include/linux/atalk.h index f6034ba774be..a55bfc6567d0 100644 --- a/include/linux/atalk.h +++ b/include/linux/atalk.h @@ -113,7 +113,7 @@ extern int aarp_proto_init(void); /* Inter module exports */ /* Give a device find its atif control structure */ -#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK) +#if IS_ENABLED(CONFIG_ATALK) static inline struct atalk_iface *atalk_find_dev(struct net_device *dev) { return dev->atalk_ptr; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2684bdb6defa..6f99c8f51b60 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2098,7 +2098,7 @@ struct net_device { #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; #endif -#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK) +#if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif struct in_device __rcu *ip_ptr; -- cgit v1.2.3 From c3fb0e280b4cdbf382f59eb6b276e4c6047bc219 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 18 Nov 2021 02:32:37 +0200 Subject: net/mlx5: DR, Fix lower case macro prefix "mlx5_" to "MLX5_" Macros prefix should be capital letters - fix the prefix in mlx5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED. Signed-off-by: Yevgeny Kliteynik --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e9db12aae8f9..18b816b41545 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1291,7 +1291,7 @@ enum { enum { MLX5_FLEX_PARSER_GENEVE_ENABLED = 1 << 3, MLX5_FLEX_PARSER_MPLS_OVER_GRE_ENABLED = 1 << 4, - mlx5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED = 1 << 5, + MLX5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED = 1 << 5, MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED = 1 << 7, MLX5_FLEX_PARSER_ICMP_V4_ENABLED = 1 << 8, MLX5_FLEX_PARSER_ICMP_V6_ENABLED = 1 << 9, -- cgit v1.2.3 From 0f2a6c3b9219bdf497750258cd2ad513f0056b42 Mon Sep 17 00:00:00 2001 From: Muhammad Sammar Date: Sun, 5 Sep 2021 15:16:21 +0300 Subject: net/mlx5: Add misc5 flow table match parameters Add support for misc5 match parameter as per HW spec, this will allow matching on tunnel_header fields. Signed-off-by: Muhammad Sammar Signed-off-by: Yevgeny Kliteynik --- include/linux/mlx5/device.h | 1 + include/linux/mlx5/mlx5_ifc.h | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 9c25edfd59a6..604b85dd770a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1117,6 +1117,7 @@ enum { MLX5_MATCH_MISC_PARAMETERS_2 = 1 << 3, MLX5_MATCH_MISC_PARAMETERS_3 = 1 << 4, MLX5_MATCH_MISC_PARAMETERS_4 = 1 << 5, + MLX5_MATCH_MISC_PARAMETERS_5 = 1 << 6, }; enum { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 18b816b41545..c74c5e147cb9 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -670,6 +670,26 @@ struct mlx5_ifc_fte_match_set_misc4_bits { u8 reserved_at_100[0x100]; }; +struct mlx5_ifc_fte_match_set_misc5_bits { + u8 macsec_tag_0[0x20]; + + u8 macsec_tag_1[0x20]; + + u8 macsec_tag_2[0x20]; + + u8 macsec_tag_3[0x20]; + + u8 tunnel_header_0[0x20]; + + u8 tunnel_header_1[0x20]; + + u8 tunnel_header_2[0x20]; + + u8 tunnel_header_3[0x20]; + + u8 reserved_at_100[0x100]; +}; + struct mlx5_ifc_cmd_pas_bits { u8 pa_h[0x20]; @@ -1839,7 +1859,9 @@ struct mlx5_ifc_fte_match_param_bits { struct mlx5_ifc_fte_match_set_misc4_bits misc_parameters_4; - u8 reserved_at_c00[0x400]; + struct mlx5_ifc_fte_match_set_misc5_bits misc_parameters_5; + + u8 reserved_at_e00[0x200]; }; enum { @@ -5977,6 +5999,7 @@ enum { MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2 = 0x3, MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_3 = 0x4, MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_4 = 0x5, + MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_5 = 0x6, }; struct mlx5_ifc_query_flow_group_out_bits { -- cgit v1.2.3 From f59464e257bdbd4df6df9a4505d7858a0baf6cf7 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 8 Nov 2021 02:42:50 +0200 Subject: net/mlx5: DR, Add support for matching on geneve_tlv_option_0_exist field Match on geneve_tlv_option_0_exist field on devices that support STEv1. Signed-off-by: Muhammad Sammar Signed-off-by: Yevgeny Kliteynik --- include/linux/mlx5/mlx5_ifc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c74c5e147cb9..deaa0f71213f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -372,7 +372,8 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 reserved_at_37[0x9]; u8 geneve_tlv_option_0_data[0x1]; - u8 reserved_at_41[0x4]; + u8 geneve_tlv_option_0_exist[0x1]; + u8 reserved_at_42[0x3]; u8 outer_first_mpls_over_udp[0x4]; u8 outer_first_mpls_over_gre[0x4]; u8 inner_first_mpls[0x4]; @@ -551,7 +552,8 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 bth_opcode[0x8]; u8 geneve_vni[0x18]; - u8 reserved_at_d8[0x7]; + u8 reserved_at_d8[0x6]; + u8 geneve_tlv_option_0_exist[0x1]; u8 geneve_oam[0x1]; u8 reserved_at_e0[0xc]; -- cgit v1.2.3 From 4ff725e1d4adfd313bc9767523fc8d6e90d50f9c Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 23 Nov 2021 02:11:12 +0200 Subject: net/mlx5: DR, Ignore modify TTL if device doesn't support it When modifying TTL, packet's csum has to be recalculated. Due to HW issue in ConnectX-5, csum recalculation for modify TTL is supported through a work-around that is specifically enabled by configuration. If the work-around isn't enabled, ignore the modify TTL action rather than adding an unsupported action. Signed-off-by: Yevgeny Kliteynik --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index deaa0f71213f..598ac3bcc901 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -833,7 +833,7 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits { u8 fdb_to_vport_reg_c_id[0x8]; u8 reserved_at_8[0xd]; u8 fdb_modify_header_fwd_to_table[0x1]; - u8 reserved_at_16[0x1]; + u8 fdb_ipv4_ttl_modify[0x1]; u8 flow_source[0x1]; u8 reserved_at_18[0x2]; u8 multi_fdb_encap[0x1]; -- cgit v1.2.3 From e7026f15564fbe0c8b091f218203111f77b84eda Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Tue, 28 Dec 2021 21:03:06 -0800 Subject: net: phy: lynx: refactor Lynx PCS module to use generic phylink_pcs Remove references to lynx_pcs structures so drivers like the Felix DSA can reference alternate PCS drivers. Signed-off-by: Colin Foster Signed-off-by: David S. Miller --- include/linux/pcs-lynx.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pcs-lynx.h b/include/linux/pcs-lynx.h index a6440d6ebe95..5712cc2ce775 100644 --- a/include/linux/pcs-lynx.h +++ b/include/linux/pcs-lynx.h @@ -9,13 +9,10 @@ #include #include -struct lynx_pcs { - struct phylink_pcs pcs; - struct mdio_device *mdio; -}; +struct mdio_device *lynx_get_mdio_device(struct phylink_pcs *pcs); -struct lynx_pcs *lynx_pcs_create(struct mdio_device *mdio); +struct phylink_pcs *lynx_pcs_create(struct mdio_device *mdio); -void lynx_pcs_destroy(struct lynx_pcs *pcs); +void lynx_pcs_destroy(struct phylink_pcs *pcs); #endif /* __LINUX_PCS_LYNX_H */ -- cgit v1.2.3 From cc4b08c31b5c51352f258032cc65e884b3e61e6a Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 7 Dec 2021 21:15:31 +0900 Subject: can: do not increase tx_bytes statistics for RTR frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The actual payload length of the CAN Remote Transmission Request (RTR) frames is always 0, i.e. no payload is transmitted on the wire. However, those RTR frames still use the DLC to indicate the length of the requested frame. As such, net_device_stats::tx_bytes should not be increased when sending RTR frames. The function can_get_echo_skb() already returns the correct length, even for RTR frames (c.f. [1]). However, for historical reasons, the drivers do not use can_get_echo_skb()'s return value and instead, most of them store a temporary length (or dlc) in some local structure or array. Using the return value of can_get_echo_skb() solves the issue. After doing this, such length/dlc fields become unused and so this patch does the adequate cleaning when needed. This patch fixes all the CAN drivers. Finally, can_get_echo_skb() is decorated with the __must_check attribute in order to force future drivers to correctly use its return value (else the compiler would emit a warning). [1] commit ed3320cec279 ("can: dev: __can_get_echo_skb(): fix real payload length return value for RTR frames") Link: https://lore.kernel.org/all/20211207121531.42941-6-mailhol.vincent@wanadoo.fr Cc: Nicolas Ferre Cc: Alexandre Belloni Cc: Ludovic Desroches Cc: Maxime Ripard Cc: Chen-Yu Tsai Cc: Jernej Skrabec Cc: Yasushi SHOJI Cc: Oliver Hartkopp Cc: Stephane Grosjean Cc: Andreas Larsson Tested-by: Jimmy Assarsson # kvaser Signed-off-by: Vincent Mailhol Acked-by: Stefan Mätje # esd_usb2 Tested-by: Stefan Mätje # esd_usb2 [mkl: add conversion for grcan] Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index d311bc369a39..fdb22b00674a 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -21,8 +21,9 @@ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr, unsigned int *frame_len_ptr); -unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx, - unsigned int *frame_len_ptr); +unsigned int __must_check can_get_echo_skb(struct net_device *dev, + unsigned int idx, + unsigned int *frame_len_ptr); void can_free_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); -- cgit v1.2.3 From c9e1d8ed304cc6106c3241add170193995953325 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 14 Dec 2021 01:02:23 +0900 Subject: can: dev: replace can_priv::ctrlmode_static by can_get_static_ctrlmode() The statically enabled features of a CAN controller can be retrieved using below formula: | u32 ctrlmode_static = priv->ctrlmode & ~priv->ctrlmode_supported; As such, there is no need to store this information. This patch remove the field ctrlmode_static of struct can_priv and provides, in replacement, the inline function can_get_static_ctrlmode() which returns the same value. Link: https://lore.kernel.org/all/20211213160226.56219-2-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 45f19d9db5ca..92e2d69462f0 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -69,7 +69,6 @@ struct can_priv { /* CAN controller features - see include/uapi/linux/can/netlink.h */ u32 ctrlmode; /* current options setting */ u32 ctrlmode_supported; /* options that can be modified by netlink */ - u32 ctrlmode_static; /* static enabled options for driver/hardware */ int restart_ms; struct delayed_work restart_work; @@ -139,13 +138,17 @@ static inline void can_set_static_ctrlmode(struct net_device *dev, /* alloc_candev() succeeded => netdev_priv() is valid at this point */ priv->ctrlmode = static_mode; - priv->ctrlmode_static = static_mode; /* override MTU which was set by default in can_setup()? */ if (static_mode & CAN_CTRLMODE_FD) dev->mtu = CANFD_MTU; } +static inline u32 can_get_static_ctrlmode(struct can_priv *priv) +{ + return priv->ctrlmode & ~priv->ctrlmode_supported; +} + void can_setup(struct net_device *dev); struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, -- cgit v1.2.3 From 7d4a101c0bd3c6e5c6e45c705a54f7bc8f6c128d Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 14 Dec 2021 01:02:24 +0900 Subject: can: dev: add sanity check in can_set_static_ctrlmode() Previous patch removed can_priv::ctrlmode_static to replace it with can_get_static_ctrlmode(). A condition sine qua non for this to work is that the controller static modes should never be set in can_priv::ctrlmode_supported (c.f. the comment on can_priv::ctrlmode_supported which states that it is for "options that can be *modified* by netlink"). Also, this condition is already correctly fulfilled by all existing drivers which rely on the ctrlmode_static feature. Nonetheless, we added an extra safeguard in can_set_static_ctrlmode() to return an error value and to warn the developer who would be adventurous enough to set to static a given feature that is already set to supported. The drivers which rely on the static controller mode are then updated to check the return value of can_set_static_ctrlmode(). Link: https://lore.kernel.org/all/20211213160226.56219-3-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 92e2d69462f0..fff3f70df697 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -131,17 +131,24 @@ static inline s32 can_get_relative_tdco(const struct can_priv *priv) } /* helper to define static CAN controller features at device creation time */ -static inline void can_set_static_ctrlmode(struct net_device *dev, - u32 static_mode) +static inline int __must_check can_set_static_ctrlmode(struct net_device *dev, + u32 static_mode) { struct can_priv *priv = netdev_priv(dev); /* alloc_candev() succeeded => netdev_priv() is valid at this point */ + if (priv->ctrlmode_supported & static_mode) { + netdev_warn(dev, + "Controller features can not be supported and static at the same time\n"); + return -EINVAL; + } priv->ctrlmode = static_mode; /* override MTU which was set by default in can_setup()? */ if (static_mode & CAN_CTRLMODE_FD) dev->mtu = CANFD_MTU; + + return 0; } static inline u32 can_get_static_ctrlmode(struct can_priv *priv) -- cgit v1.2.3 From 5fe1be81efd28bf2afcac218f23118dca6b1b648 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 14 Dec 2021 01:02:25 +0900 Subject: can: dev: reorder struct can_priv members for better packing Save eight bytes of holes on x86-64 architectures by reordering the members of struct can_priv. Before: | $ pahole -C can_priv drivers/net/can/dev/dev.o | struct can_priv { | struct net_device * dev; /* 0 8 */ | struct can_device_stats can_stats; /* 8 24 */ | const struct can_bittiming_const * bittiming_const; /* 32 8 */ | const struct can_bittiming_const * data_bittiming_const; /* 40 8 */ | struct can_bittiming bittiming; /* 48 32 */ | /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */ | struct can_bittiming data_bittiming; /* 80 32 */ | const struct can_tdc_const * tdc_const; /* 112 8 */ | struct can_tdc tdc; /* 120 12 */ | /* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */ | unsigned int bitrate_const_cnt; /* 132 4 */ | const u32 * bitrate_const; /* 136 8 */ | const u32 * data_bitrate_const; /* 144 8 */ | unsigned int data_bitrate_const_cnt; /* 152 4 */ | u32 bitrate_max; /* 156 4 */ | struct can_clock clock; /* 160 4 */ | unsigned int termination_const_cnt; /* 164 4 */ | const u16 * termination_const; /* 168 8 */ | u16 termination; /* 176 2 */ | | /* XXX 6 bytes hole, try to pack */ | | struct gpio_desc * termination_gpio; /* 184 8 */ | /* --- cacheline 3 boundary (192 bytes) --- */ | u16 termination_gpio_ohms[2]; /* 192 4 */ | enum can_state state; /* 196 4 */ | u32 ctrlmode; /* 200 4 */ | u32 ctrlmode_supported; /* 204 4 */ | int restart_ms; /* 208 4 */ | | /* XXX 4 bytes hole, try to pack */ | | struct delayed_work restart_work; /* 216 88 */ | | /* XXX last struct has 4 bytes of padding */ | | /* --- cacheline 4 boundary (256 bytes) was 48 bytes ago --- */ | int (*do_set_bittiming)(struct net_device *); /* 304 8 */ | int (*do_set_data_bittiming)(struct net_device *); /* 312 8 */ | /* --- cacheline 5 boundary (320 bytes) --- */ | int (*do_set_mode)(struct net_device *, enum can_mode); /* 320 8 */ | int (*do_set_termination)(struct net_device *, u16); /* 328 8 */ | int (*do_get_state)(const struct net_device *, enum can_state *); /* 336 8 */ | int (*do_get_berr_counter)(const struct net_device *, struct can_berr_counter *); /* 344 8 */ | unsigned int echo_skb_max; /* 352 4 */ | | /* XXX 4 bytes hole, try to pack */ | | struct sk_buff * * echo_skb; /* 360 8 */ | | /* size: 368, cachelines: 6, members: 32 */ | /* sum members: 354, holes: 3, sum holes: 14 */ | /* paddings: 1, sum paddings: 4 */ | /* last cacheline: 48 bytes */ | }; After: | $ pahole -C can_priv drivers/net/can/dev/dev.o | struct can_priv { | struct net_device * dev; /* 0 8 */ | struct can_device_stats can_stats; /* 8 24 */ | const struct can_bittiming_const * bittiming_const; /* 32 8 */ | const struct can_bittiming_const * data_bittiming_const; /* 40 8 */ | struct can_bittiming bittiming; /* 48 32 */ | /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */ | struct can_bittiming data_bittiming; /* 80 32 */ | const struct can_tdc_const * tdc_const; /* 112 8 */ | struct can_tdc tdc; /* 120 12 */ | /* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */ | unsigned int bitrate_const_cnt; /* 132 4 */ | const u32 * bitrate_const; /* 136 8 */ | const u32 * data_bitrate_const; /* 144 8 */ | unsigned int data_bitrate_const_cnt; /* 152 4 */ | u32 bitrate_max; /* 156 4 */ | struct can_clock clock; /* 160 4 */ | unsigned int termination_const_cnt; /* 164 4 */ | const u16 * termination_const; /* 168 8 */ | u16 termination; /* 176 2 */ | | /* XXX 6 bytes hole, try to pack */ | | struct gpio_desc * termination_gpio; /* 184 8 */ | /* --- cacheline 3 boundary (192 bytes) --- */ | u16 termination_gpio_ohms[2]; /* 192 4 */ | unsigned int echo_skb_max; /* 196 4 */ | struct sk_buff * * echo_skb; /* 200 8 */ | enum can_state state; /* 208 4 */ | u32 ctrlmode; /* 212 4 */ | u32 ctrlmode_supported; /* 216 4 */ | int restart_ms; /* 220 4 */ | struct delayed_work restart_work; /* 224 88 */ | | /* XXX last struct has 4 bytes of padding */ | | /* --- cacheline 4 boundary (256 bytes) was 56 bytes ago --- */ | int (*do_set_bittiming)(struct net_device *); /* 312 8 */ | /* --- cacheline 5 boundary (320 bytes) --- */ | int (*do_set_data_bittiming)(struct net_device *); /* 320 8 */ | int (*do_set_mode)(struct net_device *, enum can_mode); /* 328 8 */ | int (*do_set_termination)(struct net_device *, u16); /* 336 8 */ | int (*do_get_state)(const struct net_device *, enum can_state *); /* 344 8 */ | int (*do_get_berr_counter)(const struct net_device *, struct can_berr_counter *); /* 352 8 */ | | /* size: 360, cachelines: 6, members: 32 */ | /* sum members: 354, holes: 1, sum holes: 6 */ | /* paddings: 1, sum paddings: 4 */ | /* last cacheline: 40 bytes */ | }; Link: https://lore.kernel.org/all/20211213160226.56219-4-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index fff3f70df697..c2ea47f30046 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -64,6 +64,9 @@ struct can_priv { struct gpio_desc *termination_gpio; u16 termination_gpio_ohms[CAN_TERMINATION_GPIO_MAX]; + unsigned int echo_skb_max; + struct sk_buff **echo_skb; + enum can_state state; /* CAN controller features - see include/uapi/linux/can/netlink.h */ @@ -83,9 +86,6 @@ struct can_priv { struct can_berr_counter *bec); int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); - unsigned int echo_skb_max; - struct sk_buff **echo_skb; - #ifdef CONFIG_CAN_LEDS struct led_trigger *tx_led_trig; char tx_led_trig_name[CAN_LED_NAME_SZ]; -- cgit v1.2.3 From c6af53f038aa32cec12e8a305ba07c7ef168f1b0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 4 Jan 2022 12:07:00 +0000 Subject: net: mdio: add helpers to extract clause 45 regad and devad fields Add a couple of helpers and definitions to extract the clause 45 regad and devad fields from the regnum passed into MDIO drivers. Tested-by: Daniel Golle Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Signed-off-by: Daniel Golle Signed-off-by: David S. Miller --- include/linux/mdio.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 9f3587a61e14..ecac96d52e01 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -7,6 +7,7 @@ #define __LINUX_MDIO_H__ #include +#include #include /* Or MII_ADDR_C45 into regnum for read/write on mii_bus to enable the 21 bit @@ -14,6 +15,7 @@ */ #define MII_ADDR_C45 (1<<30) #define MII_DEVADDR_C45_SHIFT 16 +#define MII_DEVADDR_C45_MASK GENMASK(20, 16) #define MII_REGADDR_C45_MASK GENMASK(15, 0) struct gpio_desc; @@ -381,6 +383,16 @@ static inline u32 mdiobus_c45_addr(int devad, u16 regnum) return MII_ADDR_C45 | devad << MII_DEVADDR_C45_SHIFT | regnum; } +static inline u16 mdiobus_c45_regad(u32 regnum) +{ + return FIELD_GET(MII_REGADDR_C45_MASK, regnum); +} + +static inline u16 mdiobus_c45_devad(u32 regnum) +{ + return FIELD_GET(MII_DEVADDR_C45_MASK, regnum); +} + static inline int __mdiobus_c45_read(struct mii_bus *bus, int prtad, int devad, u16 regnum) { -- cgit v1.2.3 From d53ad5d8b218a885e95080d4d3d556b16b91b1b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:09 +0100 Subject: xdp: Move conversion to xdp_frame out of map functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All map redirect functions except XSK maps convert xdp_buff to xdp_frame before enqueueing it. So move this conversion of out the map functions and into xdp_do_redirect(). This removes a bit of duplicated code, but more importantly it makes it possible to support caller-allocated xdp_frame structures, which will be added in a subsequent commit. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103150812.87914-5-toke@redhat.com --- include/linux/bpf.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 26753139d5b4..6e947cd91152 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1669,17 +1669,17 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ -struct xdp_buff; +struct xdp_frame; struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; void __dev_flush(void); -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx); -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); @@ -1688,7 +1688,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, bool exclude_ingress); void __cpu_map_flush(void); -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx); int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb); @@ -1866,26 +1866,26 @@ static inline void __dev_flush(void) { } -struct xdp_buff; +struct xdp_frame; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; static inline -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { return 0; @@ -1913,7 +1913,7 @@ static inline void __cpu_map_flush(void) } static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, - struct xdp_buff *xdp, + struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; -- cgit v1.2.3 From 1372d34ccf6dd480332b2bcb2fd59a2b9a0df415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:10 +0100 Subject: xdp: Add xdp_do_redirect_frame() for pre-computed xdp_frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an xdp_do_redirect_frame() variant which supports pre-computed xdp_frame structures. This will be used in bpf_prog_run() to avoid having to write to the xdp_frame structure when the XDP program doesn't modify the frame boundaries. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103150812.87914-6-toke@redhat.com --- include/linux/filter.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 60eec80fa1d4..71fa57b88bfc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1019,6 +1019,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); +int xdp_do_redirect_frame(struct net_device *dev, + struct xdp_buff *xdp, + struct xdp_frame *xdpf, + struct bpf_prog *prog); void xdp_do_flush(void); /* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as -- cgit v1.2.3 From 007747a984ea5e895b7d8b056b24ebf431e1e71d Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Wed, 5 Jan 2022 11:33:26 +0100 Subject: net: fix SOF_TIMESTAMPING_BIND_PHC to work with multiple sockets When multiple sockets using the SOF_TIMESTAMPING_BIND_PHC flag received a packet with a hardware timestamp (e.g. multiple PTP instances in different PTP domains using the UDPv4/v6 multicast or L2 transport), the timestamps received on some sockets were corrupted due to repeated conversion of the same timestamp (by the same or different vclocks). Fix ptp_convert_timestamp() to not modify the shared skb timestamp and return the converted timestamp as a ktime_t instead. If the conversion fails, return 0 to not confuse the application with timestamps corresponding to an unexpected PHC. Fixes: d7c088265588 ("net: socket: support hardware timestamp conversion to PHC bound") Signed-off-by: Miroslav Lichvar Cc: Yangbo Lu Cc: Richard Cochran Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 2e5565067355..554454cb8693 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -351,15 +351,17 @@ int ptp_get_vclocks_index(int pclock_index, int **vclock_index); * * @hwtstamps: skb_shared_hwtstamps structure pointer * @vclock_index: phc index of ptp vclock. + * + * Returns converted timestamp, or 0 on error. */ -void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps, - int vclock_index); +ktime_t ptp_convert_timestamp(const struct skb_shared_hwtstamps *hwtstamps, + int vclock_index); #else static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { return 0; } -static inline void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps, - int vclock_index) -{ } +static inline ktime_t ptp_convert_timestamp(const struct skb_shared_hwtstamps *hwtstamps, + int vclock_index) +{ return 0; } #endif -- cgit v1.2.3 From eac1b93c14d645ef147b049ace0d5230df755548 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Wed, 5 Jan 2022 02:48:38 -0800 Subject: gro: add ability to control gro max packet size Eric Dumazet suggested to allow users to modify max GRO packet size. We have seen GRO being disabled by users of appliances (such as wifi access points) because of claimed bufferbloat issues, or some work arounds in sch_cake, to split GRO/GSO packets. Instead of disabling GRO completely, one can chose to limit the maximum packet size of GRO packets, depending on their latency constraints. This patch adds a per device gro_max_size attribute that can be changed with ip link command. ip link set dev eth0 gro_max_size 16000 Suggested-by: Eric Dumazet Signed-off-by: Coco Li Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6f99c8f51b60..3213c7227b59 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1942,6 +1942,8 @@ enum netdev_ml_priv_type { * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. + * @gro_max_size: Maximum size of aggregated packet in generic + * receive offload (GRO) * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. @@ -2131,6 +2133,8 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; int napi_defer_hard_irqs; +#define GRO_MAX_SIZE 65536 + unsigned int gro_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; @@ -4806,6 +4810,13 @@ static inline void netif_set_gso_max_segs(struct net_device *dev, WRITE_ONCE(dev->gso_max_segs, segs); } +static inline void netif_set_gro_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_max_size, size); +} + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) -- cgit v1.2.3 From 79b60ca83b6fa63ef307d2edcc77ee6581da8971 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 12 Dec 2021 14:51:27 +0200 Subject: net/mlx5: Introduce API for bulk request and release of IRQs Currently IRQs are requested one by one. To balance spreading IRQs among cpus using such scheme requires remembering cpu mask for the cpus used for a given device. This complicates the IRQ allocation scheme in subsequent patch. Hence, prepare the code for bulk IRQs allocation. This enables spreading IRQs among cpus in subsequent patch. Signed-off-by: Shay Drory Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h index ea3ff5a8ced3..3705a382276b 100644 --- a/include/linux/mlx5/eq.h +++ b/include/linux/mlx5/eq.h @@ -9,13 +9,13 @@ #define MLX5_NUM_SPARE_EQE (0x80) struct mlx5_eq; +struct mlx5_irq; struct mlx5_core_dev; struct mlx5_eq_param { - u8 irq_index; int nent; u64 mask[4]; - cpumask_var_t affinity; + struct mlx5_irq *irq; }; struct mlx5_eq * -- cgit v1.2.3 From 719774377622bc4025d2a74f551b5dc2158c6c30 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:22 +0100 Subject: netfilter: conntrack: convert to refcount_t api Convert nf_conn reference counting from atomic_t to refcount_t based api. refcount_t api provides more runtime sanity checks and will warn on certain constructs, e.g. refcount_inc() on a zero reference count, which usually indicates use-after-free. For this reason template allocation is changed to init the refcount to 1, the subsequenct add operations are removed. Likewise, init_conntrack() is changed to set the initial refcount to 1 instead refcount_inc(). This is safe because the new entry is not (yet) visible to other cpus. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 700ea077ce2d..a03f7a80b9ab 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -2,7 +2,7 @@ #ifndef _NF_CONNTRACK_COMMON_H #define _NF_CONNTRACK_COMMON_H -#include +#include #include struct ip_conntrack_stat { @@ -25,19 +25,19 @@ struct ip_conntrack_stat { #define NFCT_PTRMASK ~(NFCT_INFOMASK) struct nf_conntrack { - atomic_t use; + refcount_t use; }; void nf_conntrack_destroy(struct nf_conntrack *nfct); static inline void nf_conntrack_put(struct nf_conntrack *nfct) { - if (nfct && atomic_dec_and_test(&nfct->use)) + if (nfct && refcount_dec_and_test(&nfct->use)) nf_conntrack_destroy(nfct); } static inline void nf_conntrack_get(struct nf_conntrack *nfct) { if (nfct) - atomic_inc(&nfct->use); + refcount_inc(&nfct->use); } #endif /* _NF_CONNTRACK_COMMON_H */ -- cgit v1.2.3 From 3fce16493dc1aa2c9af3d7e7bd360dfe203a3e6a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:23 +0100 Subject: netfilter: core: move ip_ct_attach indirection to struct nf_ct_hook ip_ct_attach predates struct nf_ct_hook, we can place it there and remove the exported symbol. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 3fda1a508733..e0e3f3355ab1 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -440,7 +440,6 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include -extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu; void nf_ct_attach(struct sk_buff *, const struct sk_buff *); struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, @@ -463,6 +462,7 @@ struct nf_ct_hook { void (*destroy)(struct nf_conntrack *); bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); + void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); }; extern struct nf_ct_hook __rcu *nf_ct_hook; -- cgit v1.2.3 From 285c8a7a58158cb1805c97ff03875df2ba2ea1fe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:24 +0100 Subject: netfilter: make function op structures const No functional changes, these structures should be const. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index e0e3f3355ab1..15e71bfff726 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -381,13 +381,13 @@ struct nf_nat_hook { enum ip_conntrack_dir dir); }; -extern struct nf_nat_hook __rcu *nf_nat_hook; +extern const struct nf_nat_hook __rcu *nf_nat_hook; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { #if IS_ENABLED(CONFIG_NF_NAT) - struct nf_nat_hook *nat_hook; + const struct nf_nat_hook *nat_hook; rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); @@ -464,7 +464,7 @@ struct nf_ct_hook { const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); }; -extern struct nf_ct_hook __rcu *nf_ct_hook; +extern const struct nf_ct_hook __rcu *nf_ct_hook; struct nlattr; @@ -479,7 +479,7 @@ struct nfnl_ct_hook { void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); }; -extern struct nfnl_ct_hook __rcu *nfnl_ct_hook; +extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook; /** * nf_skb_duplicated - TEE target has sent a packet -- cgit v1.2.3 From 6ae7989c9af0d98ab64196f4f4c6f6499454bd23 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:25 +0100 Subject: netfilter: conntrack: avoid useless indirection during conntrack destruction nf_ct_put() results in a usesless indirection: nf_ct_put -> nf_conntrack_put -> nf_conntrack_destroy -> rcu readlock + indirect call of ct_hooks->destroy(). There are two _put helpers: nf_ct_put and nf_conntrack_put. The latter is what should be used in code that MUST NOT cause a linker dependency on the conntrack module (e.g. calls from core network stack). Everyone else should call nf_ct_put() instead. A followup patch will convert a few nf_conntrack_put() calls to nf_ct_put(), in particular from modules that already have a conntrack dependency such as act_ct or even nf_conntrack itself. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index a03f7a80b9ab..2770db2fa080 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -29,6 +29,8 @@ struct nf_conntrack { }; void nf_conntrack_destroy(struct nf_conntrack *nfct); + +/* like nf_ct_put, but without module dependency on nf_conntrack */ static inline void nf_conntrack_put(struct nf_conntrack *nfct) { if (nfct && refcount_dec_and_test(&nfct->use)) -- cgit v1.2.3 From 6316136ec6e3dd1c302f7e7289a9ee46ecc610ae Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 15:46:16 +0100 Subject: netfilter: egress: avoid a lockdep splat include/linux/netfilter_netdev.h:97 suspicious rcu_dereference_check() usage! 2 locks held by sd-resolve/1100: 0: ..(rcu_read_lock_bh){1:3}, at: ip_finish_output2 1: ..(rcu_read_lock_bh){1:3}, at: __dev_queue_xmit __dev_queue_xmit+0 .. The helper has two callers, one uses rcu_read_lock, the other rcu_read_lock_bh(). Annotate the dereference to reflect this. Fixes: 42df6e1d221dd ("netfilter: Introduce egress hook") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_netdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index b71b57a83bb4..b4dd96e4dc8d 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -94,7 +94,7 @@ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, return skb; #endif - e = rcu_dereference(dev->nf_hooks_egress); + e = rcu_dereference_check(dev->nf_hooks_egress, rcu_read_lock_bh_held()); if (!e) return skb; -- cgit v1.2.3 From 6f022c2ddbcefaee79502ce5386dfe351d457070 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 6 Jan 2022 17:38:04 +0200 Subject: net: openvswitch: Fix ct_state nat flags for conns arriving from tc Netfilter conntrack maintains NAT flags per connection indicating whether NAT was configured for the connection. Openvswitch maintains NAT flags on the per packet flow key ct_state field, indicating whether NAT was actually executed on the packet. When a packet misses from tc to ovs the conntrack NAT flags are set. However, NAT was not necessarily executed on the packet because the connection's state might still be in NEW state. As such, openvswitch wrongly assumes that NAT was executed and sets an incorrect flow key NAT flags. Fix this, by flagging to openvswitch which NAT was actually done in act_ct via tc_skb_ext and tc_skb_cb to the openvswitch module, so the packet flow key NAT flags will be correctly set. Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct") Signed-off-by: Paul Blakey Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20220106153804.26451-1-paulb@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4507d77d6941..60ab0c2fe567 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -287,7 +287,9 @@ struct tc_skb_ext { __u32 chain; __u16 mru; __u16 zone; - bool post_ct; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; }; #endif -- cgit v1.2.3 From c504e5c2f9648a1e5c2be01e8c3f59d394192bd3 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 9 Jan 2022 14:36:26 +0800 Subject: net: skb: introduce kfree_skb_reason() Introduce the interface kfree_skb_reason(), which is able to pass the reason why the skb is dropped to 'kfree_skb' tracepoint. Add the 'reason' field to 'trace_kfree_skb', therefor user can get more detail information about abnormal skb with 'drop_monitor' or eBPF. All drop reasons are defined in the enum 'skb_drop_reason', and they will be print as string in 'kfree_skb' tracepoint in format of 'reason: XXX'. ( Maybe the reasons should be defined in a uapi header file, so that user space can use them? ) Signed-off-by: Menglong Dong Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 642acb0d1646..ef0870abc791 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -305,6 +305,17 @@ struct sk_buff_head { struct sk_buff; +/* The reason of skb drop, which is used in kfree_skb_reason(). + * en...maybe they should be splited by group? + * + * Each item here should also be in 'TRACE_SKB_DROP_REASON', which is + * used to translate the reason to string. + */ +enum skb_drop_reason { + SKB_DROP_REASON_NOT_SPECIFIED, + SKB_DROP_REASON_MAX, +}; + /* To allow 64K frame to be packed as single skb without frag_list we * require 64K/PAGE_SIZE pages plus 1 additional page to allow for * buffers which do not start on a page boundary. @@ -1085,8 +1096,18 @@ static inline bool skb_unref(struct sk_buff *skb) return true; } +void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); + +/** + * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason + * @skb: buffer to free + */ +static inline void kfree_skb(struct sk_buff *skb) +{ + kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); +} + void skb_release_head_state(struct sk_buff *skb); -void kfree_skb(struct sk_buff *skb); void kfree_skb_list(struct sk_buff *segs); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); -- cgit v1.2.3 From 85125597419aec3aa7b8f3b8713e415f997796f2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 9 Jan 2022 14:36:27 +0800 Subject: net: skb: use kfree_skb_reason() in tcp_v4_rcv() Replace kfree_skb() with kfree_skb_reason() in tcp_v4_rcv(). Following drop reasons are added: SKB_DROP_REASON_NO_SOCKET SKB_DROP_REASON_PKT_TOO_SMALL SKB_DROP_REASON_TCP_CSUM SKB_DROP_REASON_TCP_FILTER After this patch, 'kfree_skb' event will print message like this: $ TASK-PID CPU# ||||| TIMESTAMP FUNCTION $ | | | ||||| | | -0 [000] ..s1. 36.113438: kfree_skb: skbaddr=(____ptrval____) protocol=2048 location=(____ptrval____) reason: NO_SOCKET The reason of skb drop is printed too. Signed-off-by: Menglong Dong Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ef0870abc791..c9c97b0d0fe9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -313,6 +313,10 @@ struct sk_buff; */ enum skb_drop_reason { SKB_DROP_REASON_NOT_SPECIFIED, + SKB_DROP_REASON_NO_SOCKET, + SKB_DROP_REASON_PKT_TOO_SMALL, + SKB_DROP_REASON_TCP_CSUM, + SKB_DROP_REASON_TCP_FILTER, SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From 1c7fab70df085d866a3765955f397ca2b4025b15 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 9 Jan 2022 14:36:28 +0800 Subject: net: skb: use kfree_skb_reason() in __udp4_lib_rcv() Replace kfree_skb() with kfree_skb_reason() in __udp4_lib_rcv. New drop reason 'SKB_DROP_REASON_UDP_CSUM' is added for udp csum error. Signed-off-by: Menglong Dong Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c9c97b0d0fe9..af64c7de9b53 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -317,6 +317,7 @@ enum skb_drop_reason { SKB_DROP_REASON_PKT_TOO_SMALL, SKB_DROP_REASON_TCP_CSUM, SKB_DROP_REASON_TCP_FILTER, + SKB_DROP_REASON_UDP_CSUM, SKB_DROP_REASON_MAX, }; -- cgit v1.2.3