From 0b5c21bbc01e92745ca1ca4f6fd87d878fa3ea5e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Apr 2022 11:38:47 +0200 Subject: net: ensure net_todo_list is processed quickly In [1], Will raised a potential issue that the cfg80211 code, which does (from a locking perspective) rtnl_lock() wiphy_lock() rtnl_unlock() might be suspectible to ABBA deadlocks, because rtnl_unlock() calls netdev_run_todo(), which might end up calling rtnl_lock() again, which could then deadlock (see the comment in the code added here for the scenario). Some back and forth and thinking ensued, but clearly this can't happen if the net_todo_list is empty at the rtnl_unlock() here. Clearly, the code here cannot actually put an entry on it, and all other users of rtnl_unlock() will empty it since that will always go through netdev_run_todo(), emptying the list. So the only other way to get there would be to add to the list and then unlock the RTNL without going through rtnl_unlock(), which is only possible through __rtnl_unlock(). However, this isn't exported and not used in many places, and none of them seem to be able to unregister before using it. Therefore, add a WARN_ON() in the code to ensure this invariant won't be broken, so that the cfg80211 (or any similar) code stays safe. [1] https://lore.kernel.org/r/Yjzpo3TfZxtKPMAG@google.com Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20220404113847.0ee02e4a70da.Ic73d206e217db20fd22dcec14fe5442ca732804b@changeid Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 59e27a2b7bf0..b6a1e7f643da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3894,7 +3894,8 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); extern int netdev_budget; extern unsigned int netdev_budget_usecs; -/* Called by rtnetlink.c:rtnl_unlock() */ +/* Used by rtnetlink.c:__rtnl_unlock()/rtnl_unlock() */ +extern struct list_head net_todo_list; void netdev_run_todo(void); static inline void __dev_put(struct net_device *dev) -- cgit v1.2.3 From 40379a0084c2f65eb62c102f5bbf5cdc14a50410 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 4 Apr 2022 15:08:15 +0300 Subject: net/mlx5_fpga: Drop INNOVA TLS support Mellanox INNOVA TLS cards are EOL in May, 2018 [1]. As such, the code is unmaintained, untested and not in-use by any upstream/distro oriented customers. In order to reduce code complexity, drop the kernel code. [1] https://network.nvidia.com/related-docs/eol/LCR-000286.pdf Link: https://lore.kernel.org/r/b88add368def721ea9d054cb69def72d9e3f67aa.1649073691.git.leonro@nvidia.com Reviewed-by: Tariq Toukan Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc_fpga.h | 63 -------------------------------------- 1 file changed, 63 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 07d77323f78a..e3d824f6a309 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -54,7 +54,6 @@ enum { enum { MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_IPSEC = 0x2, - MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_TLS = 0x3, }; struct mlx5_ifc_fpga_shell_caps_bits { @@ -387,27 +386,6 @@ struct mlx5_ifc_fpga_destroy_qp_out_bits { u8 reserved_at_40[0x40]; }; -struct mlx5_ifc_tls_extended_cap_bits { - u8 aes_gcm_128[0x1]; - u8 aes_gcm_256[0x1]; - u8 reserved_at_2[0x1e]; - u8 reserved_at_20[0x20]; - u8 context_capacity_total[0x20]; - u8 context_capacity_rx[0x20]; - u8 context_capacity_tx[0x20]; - u8 reserved_at_a0[0x10]; - u8 tls_counter_size[0x10]; - u8 tls_counters_addr_low[0x20]; - u8 tls_counters_addr_high[0x20]; - u8 rx[0x1]; - u8 tx[0x1]; - u8 tls_v12[0x1]; - u8 tls_v13[0x1]; - u8 lro[0x1]; - u8 ipv6[0x1]; - u8 reserved_at_106[0x1a]; -}; - struct mlx5_ifc_ipsec_extended_cap_bits { u8 encapsulation[0x20]; @@ -572,45 +550,4 @@ struct mlx5_ifc_fpga_ipsec_sa { __be16 vid; /* only 12 bits, rest is reserved */ __be16 reserved2; } __packed; - -enum fpga_tls_cmds { - CMD_SETUP_STREAM = 0x1001, - CMD_TEARDOWN_STREAM = 0x1002, - CMD_RESYNC_RX = 0x1003, -}; - -#define MLX5_TLS_1_2 (0) - -#define MLX5_TLS_ALG_AES_GCM_128 (0) -#define MLX5_TLS_ALG_AES_GCM_256 (1) - -struct mlx5_ifc_tls_cmd_bits { - u8 command_type[0x20]; - u8 ipv6[0x1]; - u8 direction_sx[0x1]; - u8 tls_version[0x2]; - u8 reserved[0x1c]; - u8 swid[0x20]; - u8 src_port[0x10]; - u8 dst_port[0x10]; - union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits src_ipv4_src_ipv6; - union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits dst_ipv4_dst_ipv6; - u8 tls_rcd_sn[0x40]; - u8 tcp_sn[0x20]; - u8 tls_implicit_iv[0x20]; - u8 tls_xor_iv[0x40]; - u8 encryption_key[0x100]; - u8 alg[4]; - u8 reserved2[0x1c]; - u8 reserved3[0x4a0]; -}; - -struct mlx5_ifc_tls_resp_bits { - u8 syndrome[0x20]; - u8 stream_id[0x20]; - u8 reserved[0x40]; -}; - -#define MLX5_TLS_COMMAND_SIZE (0x100) - #endif /* MLX5_IFC_FPGA_H */ -- cgit v1.2.3 From 0276bd3a94c072de3f69b5afe6224e488cc76635 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Apr 2022 17:15:16 +0200 Subject: IB/mlx5: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: drivers/infiniband/hw/mlx5/main.c: In function ‘translate_eth_legacy_proto_oper’: drivers/infiniband/hw/mlx5/main.c:370:2: error: case label does not reduce to an integer constant case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2): ^~~~ See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. Link: https://lore.kernel.org/all/20220405151517.29753-11-bp@alien8.de Signed-off-by: Borislav Petkov Cc: Leon Romanovsky Cc: Saeed Mahameed Cc: linux-rdma@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/port.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 28a928b0684b..e96ee1e348cb 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -141,7 +141,7 @@ enum mlx5_ptys_width { MLX5_PTYS_WIDTH_12X = 1 << 4, }; -#define MLX5E_PROT_MASK(link_mode) (1 << link_mode) +#define MLX5E_PROT_MASK(link_mode) (1U << link_mode) #define MLX5_GET_ETH_PROTO(reg, out, ext, field) \ (ext ? MLX5_GET(reg, out, ext_##field) : \ MLX5_GET(reg, out, field)) -- cgit v1.2.3 From f4b41f062c424209e3939a81e6da022e049a45f2 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 4 Apr 2022 18:30:22 +0200 Subject: net: remove noblock parameter from skb_recv_datagram() skb_recv_datagram() has two parameters 'flags' and 'noblock' that are merged inside skb_recv_datagram() by 'flags | (noblock ? MSG_DONTWAIT : 0)' As 'flags' may contain MSG_DONTWAIT as value most callers split the 'flags' into 'flags' and 'noblock' with finally obsolete bit operations like this: skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &rc); And this is not even done consistently with the 'flags' parameter. This patch removes the obsolete and costly splitting into two parameters and only performs bit operations when really needed on the caller side. One missing conversion thankfully reported by kernel test robot. I missed to enable kunit tests to build the mctp code. Reported-by: kernel test robot Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3a30cae8b0a5..2394441fa3dd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3836,8 +3836,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err); -struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, - int *err); +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err); __poll_t datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, -- cgit v1.2.3 From 804775dfc2885e93a0a4b35db1914c2cc25172b5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 5 Apr 2022 21:57:47 +0200 Subject: net: ethernet: mtk_eth_soc: add support for Wireless Ethernet Dispatch (WED) The Wireless Ethernet Dispatch subsystem on the MT7622 SoC can be configured to intercept and handle access to the DMA queues and PCIe interrupts for a MT7615/MT7915 wireless card. It can manage the internal WDMA (Wireless DMA) controller, which allows ethernet packets to be passed from the packet switch engine (PSE) to the wireless card, bypassing the CPU entirely. This can be used to implement hardware flow offloading from ethernet to WLAN. Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 131 +++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 include/linux/soc/mediatek/mtk_wed.h (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h new file mode 100644 index 000000000000..7e00cca06709 --- /dev/null +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -0,0 +1,131 @@ +#ifndef __MTK_WED_H +#define __MTK_WED_H + +#include +#include +#include +#include + +#define MTK_WED_TX_QUEUES 2 + +struct mtk_wed_hw; +struct mtk_wdma_desc; + +struct mtk_wed_ring { + struct mtk_wdma_desc *desc; + dma_addr_t desc_phys; + int size; + + u32 reg_base; + void __iomem *wpdma; +}; + +struct mtk_wed_device { +#ifdef CONFIG_NET_MEDIATEK_SOC_WED + const struct mtk_wed_ops *ops; + struct device *dev; + struct mtk_wed_hw *hw; + bool init_done, running; + int wdma_idx; + int irq; + + struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; + struct mtk_wed_ring txfree_ring; + struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES]; + + struct { + int size; + void **pages; + struct mtk_wdma_desc *desc; + dma_addr_t desc_phys; + } buf_ring; + + /* filled by driver: */ + struct { + struct pci_dev *pci_dev; + + u32 wpdma_phys; + + u16 token_start; + unsigned int nbuf; + + u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id); + int (*offload_enable)(struct mtk_wed_device *wed); + void (*offload_disable)(struct mtk_wed_device *wed); + } wlan; +#endif +}; + +struct mtk_wed_ops { + int (*attach)(struct mtk_wed_device *dev); + int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, + void __iomem *regs); + int (*txfree_ring_setup)(struct mtk_wed_device *dev, + void __iomem *regs); + void (*detach)(struct mtk_wed_device *dev); + + void (*stop)(struct mtk_wed_device *dev); + void (*start)(struct mtk_wed_device *dev, u32 irq_mask); + void (*reset_dma)(struct mtk_wed_device *dev); + + u32 (*reg_read)(struct mtk_wed_device *dev, u32 reg); + void (*reg_write)(struct mtk_wed_device *dev, u32 reg, u32 val); + + u32 (*irq_get)(struct mtk_wed_device *dev, u32 mask); + void (*irq_set_mask)(struct mtk_wed_device *dev, u32 mask); +}; + +extern const struct mtk_wed_ops __rcu *mtk_soc_wed_ops; + +static inline int +mtk_wed_device_attach(struct mtk_wed_device *dev) +{ + int ret = -ENODEV; + +#ifdef CONFIG_NET_MEDIATEK_SOC_WED + rcu_read_lock(); + dev->ops = rcu_dereference(mtk_soc_wed_ops); + if (dev->ops) + ret = dev->ops->attach(dev); + else + rcu_read_unlock(); + + if (ret) + dev->ops = NULL; +#endif + + return ret; +} + +#ifdef CONFIG_NET_MEDIATEK_SOC_WED +#define mtk_wed_device_active(_dev) !!(_dev)->ops +#define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev) +#define mtk_wed_device_start(_dev, _mask) (_dev)->ops->start(_dev, _mask) +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) \ + (_dev)->ops->tx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_txfree_ring_setup(_dev, _regs) \ + (_dev)->ops->txfree_ring_setup(_dev, _regs) +#define mtk_wed_device_reg_read(_dev, _reg) \ + (_dev)->ops->reg_read(_dev, _reg) +#define mtk_wed_device_reg_write(_dev, _reg, _val) \ + (_dev)->ops->reg_write(_dev, _reg, _val) +#define mtk_wed_device_irq_get(_dev, _mask) \ + (_dev)->ops->irq_get(_dev, _mask) +#define mtk_wed_device_irq_set_mask(_dev, _mask) \ + (_dev)->ops->irq_set_mask(_dev, _mask) +#else +static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) +{ + return false; +} +#define mtk_wed_device_detach(_dev) do {} while (0) +#define mtk_wed_device_start(_dev, _mask) do {} while (0) +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_txfree_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_reg_read(_dev, _reg) 0 +#define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) +#define mtk_wed_device_irq_get(_dev, _mask) 0 +#define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0) +#endif + +#endif -- cgit v1.2.3 From a333215e10cb5d3b1e0685ca117f0e9452215485 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 5 Apr 2022 21:57:48 +0200 Subject: net: ethernet: mtk_eth_soc: implement flow offloading to WED devices This allows hardware flow offloading from Ethernet to WLAN on MT7622 SoC Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b6a1e7f643da..7b2a0b739684 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -862,6 +862,7 @@ enum net_device_path_type { DEV_PATH_BRIDGE, DEV_PATH_PPPOE, DEV_PATH_DSA, + DEV_PATH_MTK_WDMA, }; struct net_device_path { @@ -887,6 +888,12 @@ struct net_device_path { int port; u16 proto; } dsa; + struct { + u8 wdma_idx; + u8 queue; + u16 wcid; + u8 bss; + } mtk_wdma; }; }; -- cgit v1.2.3 From 6264f58ca0e54e41d63c2d00334a48bac28fbf30 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Apr 2022 14:37:54 -0700 Subject: net: extract a few internals from netdevice.h There's a number of functions and static variables used under net/core/ but not from the outside. We currently dump most of them into netdevice.h. That bad for many reasons: - netdevice.h is very cluttered, hard to figure out what the APIs are; - netdevice.h is very long; - we have to touch netdevice.h more which causes expensive incremental builds. Create a header under net/core/ and move some declarations. The new header is also a bit of a catch-all but that's fine, if we create more specific headers people will likely over-think where their declaration fit best. And end up putting them in netdevice.h, again. More work should be done on splitting netdevice.h into more targeted headers, but that'd be more time consuming so small steps. Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 72 ++--------------------------------------------- 1 file changed, 2 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7b2a0b739684..7e7b2a72e473 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -59,7 +59,8 @@ struct dsa_port; struct ip_tunnel_parm; struct macsec_context; struct macsec_ops; - +struct netdev_name_node; +struct sd_flow_limit; struct sfp_bus; /* 802.11 specific */ struct wireless_dev; @@ -1020,16 +1021,6 @@ struct dev_ifalias { struct devlink; struct tlsdev_ops; -struct netdev_name_node { - struct hlist_node hlist; - struct list_head list; - struct net_device *dev; - const char *name; -}; - -int netdev_name_node_alt_create(struct net_device *dev, const char *name); -int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); - struct netdev_net_notifier { struct list_head list; struct notifier_block *nb; @@ -2975,7 +2966,6 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); -int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); @@ -3034,19 +3024,6 @@ static inline bool dev_has_header(const struct net_device *dev) return dev->header_ops && dev->header_ops->create; } -#ifdef CONFIG_NET_FLOW_LIMIT -#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ -struct sd_flow_limit { - u64 count; - unsigned int num_buckets; - unsigned int history_head; - u16 history[FLOW_LIMIT_HISTORY]; - u8 buckets[]; -}; - -extern int netdev_flow_limit_table_len; -#endif /* CONFIG_NET_FLOW_LIMIT */ - /* * Incoming packets are placed on per-CPU queues */ @@ -3770,7 +3747,6 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); void __dev_notify_flags(struct net_device *, unsigned int old_flags, unsigned int gchanges); -int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, @@ -3782,13 +3758,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, return __dev_change_net_namespace(dev, net, pat, 0); } int __dev_set_mtu(struct net_device *, int); -int dev_validate_mtu(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); -int dev_set_mtu_ext(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); int dev_set_mtu(struct net_device *, int); -int dev_change_tx_queue_len(struct net_device *, unsigned long); -void dev_set_group(struct net_device *, int); int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, @@ -3796,24 +3766,13 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); -int dev_change_carrier(struct net_device *, bool new_carrier); -int dev_get_phys_port_id(struct net_device *dev, - struct netdev_phys_item_id *ppid); -int dev_get_phys_port_name(struct net_device *dev, - char *name, size_t len); int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); -int dev_change_proto_down(struct net_device *dev, bool proto_down); -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); -int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, int expected_fd, u32 flags); int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); @@ -3898,13 +3857,6 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev, bool dev_nit_active(struct net_device *dev); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); -extern int netdev_budget; -extern unsigned int netdev_budget_usecs; - -/* Used by rtnetlink.c:__rtnl_unlock()/rtnl_unlock() */ -extern struct list_head net_todo_list; -void netdev_run_todo(void); - static inline void __dev_put(struct net_device *dev) { if (dev) { @@ -4021,10 +3973,7 @@ static inline void dev_replace_track(struct net_device *odev, * called netif_lowerlayer_*() because they represent the state of any * kind of lower layer not just hardware media. */ - -void linkwatch_init_dev(struct net_device *dev); void linkwatch_fire_event(struct net_device *dev); -void linkwatch_forget_dev(struct net_device *dev); /** * netif_carrier_ok - test if carrier present @@ -4470,9 +4419,6 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); -void dev_addr_flush(struct net_device *dev); -int dev_addr_init(struct net_device *dev); -void dev_addr_check(struct net_device *dev); /* Functions used for unicast addresses handling */ int dev_uc_add(struct net_device *dev, const unsigned char *addr); @@ -4562,7 +4508,6 @@ static inline void __dev_mc_unsync(struct net_device *dev, /* Functions used for secondary unicast and multicast support */ void dev_set_rx_mode(struct net_device *dev); -void __dev_set_rx_mode(struct net_device *dev); int dev_set_promiscuity(struct net_device *dev, int inc); int dev_set_allmulti(struct net_device *dev, int inc); void netdev_state_change(struct net_device *dev); @@ -4580,11 +4525,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; -extern int netdev_tstamp_prequeue; -extern int netdev_unregister_timeout_secs; -extern int weight_p; -extern int dev_weight_rx_bias; -extern int dev_weight_tx_bias; extern int dev_rx_weight; extern int dev_tx_weight; extern int gro_normal_batch; @@ -4772,12 +4712,6 @@ static inline void netdev_rx_csum_fault(struct net_device *dev, void net_enable_timestamp(void); void net_disable_timestamp(void); -#ifdef CONFIG_PROC_FS -int __init dev_proc_init(void); -#else -#define dev_proc_init() 0 -#endif - static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) @@ -4813,8 +4747,6 @@ extern const struct kobj_ns_type_operations net_ns_type_operations; const char *netdev_drivername(const struct net_device *dev); -void linkwatch_run_queue(void); - static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { -- cgit v1.2.3 From 794c24e9921f32ded4422833a990ccf11dc3c00e Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 6 Apr 2022 17:26:00 +0000 Subject: net-core: rx_otherhost_dropped to core_stats Increment rx_otherhost_dropped counter when packet dropped due to mismatched dest MAC addr. An example when this drop can occur is when manually crafting raw packets that will be consumed by a user space application via a tap device. For testing purposes local traffic was generated using trafgen for the client and netcat to start a server Tested: Created 2 netns, sent 1 packet using trafgen from 1 to the other with "{eth(daddr=$INCORRECT_MAC...}", verified that iproute2 showed the counter was incremented. (Also had to modify iproute2 to show the stat, additional patch for that coming next.) Signed-off-by: Jeffrey Ji Reviewed-by: Brian Vazquez Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220406172600.1141083-1-jeffreyjilinux@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7e7b2a72e473..28ea4f8269d4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -203,6 +203,7 @@ struct net_device_core_stats { local_t rx_dropped; local_t tx_dropped; local_t rx_nohandler; + local_t rx_otherhost_dropped; } __aligned(4 * sizeof(local_t)); #include @@ -3837,6 +3838,7 @@ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) DEV_CORE_STATS_INC(rx_nohandler) +DEV_CORE_STATS_INC(rx_otherhost_dropped) static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb, -- cgit v1.2.3 From 2fa33b3518a8da0a5345b7ae0064223b5e4e156f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:36 +0300 Subject: net/mlx5_fpga: Drop INNOVA IPsec support Mellanox INNOVA IPsec cards are EOL in Nov, 2019 [1]. As such, the code is unmaintained, untested and not in-use by any upstream/distro oriented customers. In order to reduce code complexity, drop the kernel code. [1] https://network.nvidia.com/related-docs/eol/LCR-000535.pdf Link: https://lore.kernel.org/r/2afe88ec5020a491079eacf6fe3c89b64d65195c.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc_fpga.h | 148 ------------------------------------- 1 file changed, 148 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index e3d824f6a309..45c7c0d67635 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -386,68 +386,6 @@ struct mlx5_ifc_fpga_destroy_qp_out_bits { u8 reserved_at_40[0x40]; }; -struct mlx5_ifc_ipsec_extended_cap_bits { - u8 encapsulation[0x20]; - - u8 reserved_0[0x12]; - u8 v2_command[0x1]; - u8 udp_encap[0x1]; - u8 rx_no_trailer[0x1]; - u8 ipv4_fragment[0x1]; - u8 ipv6[0x1]; - u8 esn[0x1]; - u8 lso[0x1]; - u8 transport_and_tunnel_mode[0x1]; - u8 tunnel_mode[0x1]; - u8 transport_mode[0x1]; - u8 ah_esp[0x1]; - u8 esp[0x1]; - u8 ah[0x1]; - u8 ipv4_options[0x1]; - - u8 auth_alg[0x20]; - - u8 enc_alg[0x20]; - - u8 sa_cap[0x20]; - - u8 reserved_1[0x10]; - u8 number_of_ipsec_counters[0x10]; - - u8 ipsec_counters_addr_low[0x20]; - u8 ipsec_counters_addr_high[0x20]; -}; - -struct mlx5_ifc_ipsec_counters_bits { - u8 dec_in_packets[0x40]; - - u8 dec_out_packets[0x40]; - - u8 dec_bypass_packets[0x40]; - - u8 enc_in_packets[0x40]; - - u8 enc_out_packets[0x40]; - - u8 enc_bypass_packets[0x40]; - - u8 drop_dec_packets[0x40]; - - u8 failed_auth_dec_packets[0x40]; - - u8 drop_enc_packets[0x40]; - - u8 success_add_sa[0x40]; - - u8 fail_add_sa[0x40]; - - u8 success_delete_sa[0x40]; - - u8 fail_delete_sa[0x40]; - - u8 dropped_cmd[0x40]; -}; - enum { MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RETRY_COUNTER_EXPIRED = 0x1, MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RNR_EXPIRED = 0x2, @@ -464,90 +402,4 @@ struct mlx5_ifc_fpga_qp_error_event_bits { u8 reserved_at_c0[0x8]; u8 fpga_qpn[0x18]; }; -enum mlx5_ifc_fpga_ipsec_response_syndrome { - MLX5_FPGA_IPSEC_RESPONSE_SUCCESS = 0, - MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST = 1, - MLX5_FPGA_IPSEC_RESPONSE_SADB_ISSUE = 2, - MLX5_FPGA_IPSEC_RESPONSE_WRITE_RESPONSE_ISSUE = 3, -}; - -struct mlx5_ifc_fpga_ipsec_cmd_resp { - __be32 syndrome; - union { - __be32 sw_sa_handle; - __be32 flags; - }; - u8 reserved[24]; -} __packed; - -enum mlx5_ifc_fpga_ipsec_cmd_opcode { - MLX5_FPGA_IPSEC_CMD_OP_ADD_SA = 0, - MLX5_FPGA_IPSEC_CMD_OP_DEL_SA = 1, - MLX5_FPGA_IPSEC_CMD_OP_ADD_SA_V2 = 2, - MLX5_FPGA_IPSEC_CMD_OP_DEL_SA_V2 = 3, - MLX5_FPGA_IPSEC_CMD_OP_MOD_SA_V2 = 4, - MLX5_FPGA_IPSEC_CMD_OP_SET_CAP = 5, -}; - -enum mlx5_ifc_fpga_ipsec_cap { - MLX5_FPGA_IPSEC_CAP_NO_TRAILER = BIT(0), -}; - -struct mlx5_ifc_fpga_ipsec_cmd_cap { - __be32 cmd; - __be32 flags; - u8 reserved[24]; -} __packed; - -enum mlx5_ifc_fpga_ipsec_sa_flags { - MLX5_FPGA_IPSEC_SA_ESN_EN = BIT(0), - MLX5_FPGA_IPSEC_SA_ESN_OVERLAP = BIT(1), - MLX5_FPGA_IPSEC_SA_IPV6 = BIT(2), - MLX5_FPGA_IPSEC_SA_DIR_SX = BIT(3), - MLX5_FPGA_IPSEC_SA_SPI_EN = BIT(4), - MLX5_FPGA_IPSEC_SA_SA_VALID = BIT(5), - MLX5_FPGA_IPSEC_SA_IP_ESP = BIT(6), - MLX5_FPGA_IPSEC_SA_IP_AH = BIT(7), -}; - -enum mlx5_ifc_fpga_ipsec_sa_enc_mode { - MLX5_FPGA_IPSEC_SA_ENC_MODE_NONE = 0, - MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_128_AUTH_128 = 1, - MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_256_AUTH_128 = 3, -}; - -struct mlx5_ifc_fpga_ipsec_sa_v1 { - __be32 cmd; - u8 key_enc[32]; - u8 key_auth[32]; - __be32 sip[4]; - __be32 dip[4]; - union { - struct { - __be32 reserved; - u8 salt_iv[8]; - __be32 salt; - } __packed gcm; - struct { - u8 salt[16]; - } __packed cbc; - }; - __be32 spi; - __be32 sw_sa_handle; - __be16 tfclen; - u8 enc_mode; - u8 reserved1[2]; - u8 flags; - u8 reserved2[2]; -}; - -struct mlx5_ifc_fpga_ipsec_sa { - struct mlx5_ifc_fpga_ipsec_sa_v1 ipsec_sa_v1; - __be16 udp_sp; - __be16 udp_dp; - u8 reserved1[4]; - __be32 esn; - __be16 vid; /* only 12 bits, rest is reserved */ - __be16 reserved2; -} __packed; #endif /* MLX5_IFC_FPGA_H */ -- cgit v1.2.3 From de8bdb476908e64805df4bfbad20618cbb1f9ffa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:42 +0300 Subject: RDMA/mlx5: Drop crypto flow steering API The mlx5 flow steering crypto API was intended to be used in FPGA devices, which is not supported for years already. The removal of mlx5 crypto FPGA code together with inability to configure encryption keys makes the low steering API completely unusable. So delete the code, so any ESP flow steering requests will fail with not supported error, as it is happening now anyway as no device support this type of API. Link: https://lore.kernel.org/r/634a5face7734381463d809bfb89850f6998deac.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/accel.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index dacf69516002..af67d51308cf 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -111,10 +111,6 @@ struct mlx5_accel_esp_xfrm { struct mlx5_accel_esp_xfrm_attrs attrs; }; -enum { - MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA = 1UL << 0, -}; - enum mlx5_accel_ipsec_cap { MLX5_ACCEL_IPSEC_CAP_DEVICE = 1 << 0, MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA = 1 << 1, @@ -132,8 +128,7 @@ u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev); struct mlx5_accel_esp_xfrm * mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, - const struct mlx5_accel_esp_xfrm_attrs *attrs, - u32 flags); + const struct mlx5_accel_esp_xfrm_attrs *attrs); void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm); int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, const struct mlx5_accel_esp_xfrm_attrs *attrs); @@ -144,8 +139,10 @@ static inline u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev) { ret static inline struct mlx5_accel_esp_xfrm * mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, - const struct mlx5_accel_esp_xfrm_attrs *attrs, - u32 flags) { return ERR_PTR(-EOPNOTSUPP); } + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + return ERR_PTR(-EOPNOTSUPP); +} static inline void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) {} static inline int -- cgit v1.2.3 From 2451da081a343e079d9f5a7b063fcdf0bc439aa8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:46 +0300 Subject: net/mlx5: Unify device IPsec capabilities check Merge two different function to one in order to provide coherent picture if the device is IPsec capable or not. Link: https://lore.kernel.org/r/8f10ea06ad19c6f651e9fb33921009658f01e1d5.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/accel.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index af67d51308cf..9145e2d37c0e 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -124,7 +124,7 @@ enum mlx5_accel_ipsec_cap { #ifdef CONFIG_MLX5_ACCEL -u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev); +u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev); struct mlx5_accel_esp_xfrm * mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, @@ -135,7 +135,10 @@ int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, #else -static inline u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev) { return 0; } +static inline u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev) +{ + return 0; +} static inline struct mlx5_accel_esp_xfrm * mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, -- cgit v1.2.3 From 54deb0e77561973f4ca4515e18ab972c281eea1d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:48 +0300 Subject: net/mlx5: Remove not-needed IPsec config In current code, the CONFIG_MLX5_IPSEC and CONFIG_MLX5_EN_IPSEC are the same. So remove useless indirection. Link: https://lore.kernel.org/r/fd14492cbc01a0d51a5bfedde02bcd2154123fde.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/accel.h | 4 ++-- include/linux/mlx5/driver.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index 9145e2d37c0e..73e4d50a9f02 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -122,7 +122,7 @@ enum mlx5_accel_ipsec_cap { MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN = 1 << 7, }; -#ifdef CONFIG_MLX5_ACCEL +#ifdef CONFIG_MLX5_EN_IPSEC u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev); @@ -152,5 +152,5 @@ static inline int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, const struct mlx5_accel_esp_xfrm_attrs *attrs) { return -EOPNOTSUPP; } -#endif /* CONFIG_MLX5_ACCEL */ +#endif /* CONFIG_MLX5_EN_IPSEC */ #endif /* __MLX5_ACCEL_H__ */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9424503eb8d3..5af53c035949 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -778,7 +778,7 @@ struct mlx5_core_dev { #ifdef CONFIG_MLX5_FPGA struct mlx5_fpga_device *fpga; #endif -#ifdef CONFIG_MLX5_ACCEL +#ifdef CONFIG_MLX5_EN_IPSEC const struct mlx5_accel_ipsec_ops *ipsec_ops; #endif struct mlx5_clock clock; -- cgit v1.2.3 From f2b41b32cde8453a0a26875261f0e26809c2805a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:51 +0300 Subject: net/mlx5: Remove ipsec_ops function table There is only one IPsec implementation and ipsec_ops is not needed at all in this situation. Together with removal of ipsec_ops, we can drop the entry checks as these functions are called for IPsec devices only. Link: https://lore.kernel.org/r/bc8dd1c8a77b65dbf5e2cf92c813ffaca2505c5f.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5af53c035949..ff47d49d8be4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -777,9 +777,6 @@ struct mlx5_core_dev { } roce; #ifdef CONFIG_MLX5_FPGA struct mlx5_fpga_device *fpga; -#endif -#ifdef CONFIG_MLX5_EN_IPSEC - const struct mlx5_accel_ipsec_ops *ipsec_ops; #endif struct mlx5_clock clock; struct mlx5_ib_clock_info *clock_info; -- cgit v1.2.3 From 2984287c4c19949d7eb451dcad0bd5c54a2a376f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:52 +0300 Subject: net/mlx5: Remove not-implemented IPsec capabilities Clean a capabilities enum to remove not-implemented bits. Link: https://lore.kernel.org/r/1044bb7b779107ff38e48e3f6553421104f3f819.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/accel.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index 73e4d50a9f02..0f2596297f6a 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -113,13 +113,10 @@ struct mlx5_accel_esp_xfrm { enum mlx5_accel_ipsec_cap { MLX5_ACCEL_IPSEC_CAP_DEVICE = 1 << 0, - MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA = 1 << 1, - MLX5_ACCEL_IPSEC_CAP_ESP = 1 << 2, - MLX5_ACCEL_IPSEC_CAP_IPV6 = 1 << 3, - MLX5_ACCEL_IPSEC_CAP_LSO = 1 << 4, - MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER = 1 << 5, - MLX5_ACCEL_IPSEC_CAP_ESN = 1 << 6, - MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN = 1 << 7, + MLX5_ACCEL_IPSEC_CAP_ESP = 1 << 1, + MLX5_ACCEL_IPSEC_CAP_IPV6 = 1 << 2, + MLX5_ACCEL_IPSEC_CAP_LSO = 1 << 3, + MLX5_ACCEL_IPSEC_CAP_ESN = 1 << 4, }; #ifdef CONFIG_MLX5_EN_IPSEC -- cgit v1.2.3 From 9f8ed577c28813410614b418bad42285840c1a00 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Apr 2022 14:20:50 +0800 Subject: net: skb: rename SKB_DROP_REASON_PTYPE_ABSENT As David Ahern suggested, the reasons for skb drops should be more general and not be code based. Therefore, rename SKB_DROP_REASON_PTYPE_ABSENT to SKB_DROP_REASON_UNHANDLED_PROTO, which is used for the cases of no L3 protocol handler, no L4 protocol handler, version extensions, etc. From previous discussion, now we have the aim to make these reasons more abstract and users based, avoiding code based. Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2394441fa3dd..173bc35a10a3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -408,11 +408,9 @@ enum skb_drop_reason { */ SKB_DROP_REASON_XDP, /* dropped by XDP in input path */ SKB_DROP_REASON_TC_INGRESS, /* dropped in TC ingress HOOK */ - SKB_DROP_REASON_PTYPE_ABSENT, /* not packet_type found to handle - * the skb. For an etner packet, - * this means that L3 protocol is - * not supported - */ + SKB_DROP_REASON_UNHANDLED_PROTO, /* protocol not implemented + * or not supported + */ SKB_DROP_REASON_SKB_CSUM, /* sk_buff checksum computation * error */ -- cgit v1.2.3 From b384c95a861eebf47e88695cf6a29f34e0b10b0f Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Apr 2022 14:20:52 +0800 Subject: net: icmp: add skb drop reasons to icmp protocol Replace kfree_skb() used in icmp_rcv() and icmpv6_rcv() with kfree_skb_reason(). In order to get the reasons of the skb drops after icmp message handle, we change the return type of 'handler()' in 'struct icmp_control' from 'bool' to 'enum skb_drop_reason'. This may change its original intention, as 'false' means failure, but 'SKB_NOT_DROPPED_YET' means success now. Therefore, all 'handler' and the call of them need to be handled. Following 'handler' functions are involved: icmp_unreach() icmp_redirect() icmp_echo() icmp_timestamp() icmp_discard() And following new drop reasons are added: SKB_DROP_REASON_ICMP_CSUM SKB_DROP_REASON_INVALID_PROTO The reason 'INVALID_PROTO' is introduced for the case that the packet doesn't follow rfc 1122 and is dropped. This is not a common case, and I believe we can locate the problem from the data in the packet. For now, this 'INVALID_PROTO' is used for the icmp broadcasts with wrong types. Maybe there should be a document file for these reasons. For example, list all the case that causes the 'UNHANDLED_PROTO' and 'INVALID_PROTO' drop reason. Therefore, users can locate their problems according to the document. Reviewed-by: Hao Peng Reviewed-by: Jiang Biao Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 173bc35a10a3..9b81ba497665 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -442,6 +442,11 @@ enum skb_drop_reason { SKB_DROP_REASON_TAP_TXFILTER, /* dropped by tx filter implemented * at tun/tap, e.g., check_filter() */ + SKB_DROP_REASON_ICMP_CSUM, /* ICMP checksum error */ + SKB_DROP_REASON_INVALID_PROTO, /* the packet doesn't follow RFC + * 2211, such as a broadcasts + * ICMP_TIMESTAMP + */ SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From 1a95e04e29a116c3424988c70c441ca8ec2779ff Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 12 Apr 2022 11:24:00 +0100 Subject: net: phylink: remove phylink_helper_basex_speed() As there are now no users of phylink_helper_basex_speed(), we can remove this obsolete functionality. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 223781622b33..6d06896fc20d 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -160,11 +160,6 @@ struct phylink_mac_ops { * clearing unsupported speeds and duplex settings. The port modes * should not be cleared; phylink_set_port_modes() will help with this. * - * If the @state->interface mode is %PHY_INTERFACE_MODE_1000BASEX - * or %PHY_INTERFACE_MODE_2500BASEX, select the appropriate mode - * based on @state->advertising and/or @state->speed and update - * @state->interface accordingly. See phylink_helper_basex_speed(). - * * When @config->supported_interfaces has been set, phylink will iterate * over the supported interfaces to determine the full capability of the * MAC. The validation function must not print errors if @state->interface @@ -579,7 +574,6 @@ int phylink_speed_up(struct phylink *pl); #define phylink_test(bm, mode) __phylink_do_bit(test_bit, bm, mode) void phylink_set_port_modes(unsigned long *bits); -void phylink_helper_basex_speed(struct phylink_link_state *state); void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, u16 bmsr, u16 lpa); -- cgit v1.2.3 From 1306d5362a591493a2d07f685ed2cc480dcda320 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 13 Apr 2022 13:51:56 +0300 Subject: net: add ndo_fdb_del_bulk Add a new netdev op called ndo_fdb_del_bulk, it will be later used for driver-specific bulk delete implementation dispatched from rtnetlink. The first user will be the bridge, we need it to signal to rtnetlink from the driver that we support bulk delete operation (NLM_F_BULK). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28ea4f8269d4..a602f29365b0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1260,6 +1260,10 @@ struct netdev_net_notifier { * struct net_device *dev, * const unsigned char *addr, u16 vid) * Deletes the FDB entry from dev coresponding to addr. + * int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[], + * struct net_device *dev, + * u16 vid, + * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, * struct net_device *dev, struct net_device *filter_dev, * int *idx) @@ -1510,6 +1514,11 @@ struct net_device_ops { struct net_device *dev, const unsigned char *addr, u16 vid); + int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + u16 vid, + struct netlink_ext_ack *extack); int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, -- cgit v1.2.3 From d6d3146ce532268ad0ffd8d92d2b7492898decf1 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 13 Apr 2022 16:15:52 +0800 Subject: skb: add some helpers for skb drop reasons In order to simply the definition and assignment for 'enum skb_drop_reason', introduce some helpers. SKB_DR() is used to define a variable of type 'enum skb_drop_reason' with the 'SKB_DROP_REASON_NOT_SPECIFIED' initial value. SKB_DR_SET() is used to set the value of the variable. Seems it is a little useless? But it makes the code shorter. SKB_DR_OR() is used to set the value of the variable if it is not set yet, which means its value is SKB_DROP_REASON_NOT_SPECIFIED. Signed-off-by: Menglong Dong Reviewed-by: Jiang Biao Reviewed-by: Hao Peng Signed-off-by: David S. Miller --- include/linux/skbuff.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9b81ba497665..0cbd6ada957c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -450,6 +450,18 @@ enum skb_drop_reason { SKB_DROP_REASON_MAX, }; +#define SKB_DR_INIT(name, reason) \ + enum skb_drop_reason name = SKB_DROP_REASON_##reason +#define SKB_DR(name) \ + SKB_DR_INIT(name, NOT_SPECIFIED) +#define SKB_DR_SET(name, reason) \ + (name = SKB_DROP_REASON_##reason) +#define SKB_DR_OR(name, reason) \ + do { \ + if (name == SKB_DROP_REASON_NOT_SPECIFIED) \ + SKB_DR_SET(name, reason); \ + } while (0) + /* To allow 64K frame to be packed as single skb without frag_list we * require 64K/PAGE_SIZE pages plus 1 additional page to allow for * buffers which do not start on a page boundary. -- cgit v1.2.3 From c4eb664191b4a5ff6856478f903924176697719e Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 13 Apr 2022 16:15:53 +0800 Subject: net: ipv4: add skb drop reasons to ip_error() Eventually, I find out the handler function for inputting route lookup fail: ip_error(). The drop reasons we used in ip_error() are almost corresponding to IPSTATS_MIB_*, and following new reasons are introduced: SKB_DROP_REASON_IP_INADDRERRORS SKB_DROP_REASON_IP_INNOROUTES Isn't the name SKB_DROP_REASON_IP_HOSTUNREACH and SKB_DROP_REASON_IP_NETUNREACH more accurate? To make them corresponding to IPSTATS_MIB_*, we keep their name still. Signed-off-by: Menglong Dong Reviewed-by: Jiang Biao Reviewed-by: Hao Peng Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0cbd6ada957c..886e83ac4b70 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -447,6 +447,12 @@ enum skb_drop_reason { * 2211, such as a broadcasts * ICMP_TIMESTAMP */ + SKB_DROP_REASON_IP_INADDRERRORS, /* host unreachable, corresponding + * to IPSTATS_MIB_INADDRERRORS + */ + SKB_DROP_REASON_IP_INNOROUTES, /* network unreachable, corresponding + * to IPSTATS_MIB_INADDRERRORS + */ SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From 2edc1a383fda8d2f580216292dfd9daeae691e47 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 13 Apr 2022 16:15:55 +0800 Subject: net: ip: add skb drop reasons to ip forwarding Replace kfree_skb() which is used in ip6_forward() and ip_forward() with kfree_skb_reason(). The new drop reason 'SKB_DROP_REASON_PKT_TOO_BIG' is introduced for the case that the length of the packet exceeds MTU and can't fragment. Signed-off-by: Menglong Dong Reviewed-by: Jiang Biao Reviewed-by: Hao Peng Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 886e83ac4b70..0ef11df1bc67 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -453,6 +453,9 @@ enum skb_drop_reason { SKB_DROP_REASON_IP_INNOROUTES, /* network unreachable, corresponding * to IPSTATS_MIB_INADDRERRORS */ + SKB_DROP_REASON_PKT_TOO_BIG, /* packet size is too big (maybe exceed + * the MTU) + */ SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From 1ad6d548e2a452f21bcee4606ee4ec7afcde5f37 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 13 Apr 2022 16:15:56 +0800 Subject: net: icmp: introduce function icmpv6_param_prob_reason() In order to add the skb drop reasons support to icmpv6_param_prob(), introduce the function icmpv6_param_prob_reason() and make icmpv6_param_prob() an inline call to it. This new function will be used in the following patches. Signed-off-by: Menglong Dong Reviewed-by: Jiang Biao Reviewed-by: Hao Peng Signed-off-by: David S. Miller --- include/linux/icmpv6.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index 9055cb380ee2..db0f4fcfdaf4 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -79,8 +79,9 @@ extern int icmpv6_init(void); extern int icmpv6_err_convert(u8 type, u8 code, int *err); extern void icmpv6_cleanup(void); -extern void icmpv6_param_prob(struct sk_buff *skb, - u8 code, int pos); +extern void icmpv6_param_prob_reason(struct sk_buff *skb, + u8 code, int pos, + enum skb_drop_reason reason); struct flowi6; struct in6_addr; @@ -91,6 +92,12 @@ extern void icmpv6_flow_init(struct sock *sk, const struct in6_addr *daddr, int oif); +static inline void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) +{ + icmpv6_param_prob_reason(skb, code, pos, + SKB_DROP_REASON_NOT_SPECIFIED); +} + static inline bool icmpv6_is_err(int type) { switch (type) { -- cgit v1.2.3 From 64b97df995f0c943be469a019d6117c89e2131bc Mon Sep 17 00:00:00 2001 From: Lech Perczak Date: Wed, 13 Apr 2022 03:44:14 +0200 Subject: cdc_ether: export usbnet_cdc_zte_rx_fixup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit bfe9b9d2df66 ("cdc_ether: Improve ZTE MF823/831/910 handling") introduces a workaround for certain ZTE modems reporting invalid MAC addresses over CDC-ECM. The same issue was present on their RNDIS interface,which was fixed in commit a5a18bdf7453 ("rndis_host: Set valid random MAC on buggy devices"). However, internal modem of ZTE MF286R router, on its RNDIS interface, also exhibits a second issue fixed already in CDC-ECM, of the device not respecting configured random MAC address. In order to share the fixup for this with rndis_host driver, export the workaround function, which will be re-used in the following commit in rndis_host. Cc: Kristian Evensen Cc: Bjørn Mork Cc: Oliver Neukum Signed-off-by: Lech Perczak Signed-off-by: Paolo Abeni --- include/linux/usb/usbnet.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 8336e86ce606..1b4d72d5e891 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -214,6 +214,7 @@ extern int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf) extern int usbnet_cdc_bind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_unbind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_status(struct usbnet *, struct urb *); +extern int usbnet_cdc_zte_rx_fixup(struct usbnet *dev, struct sk_buff *skb); /* CDC and RNDIS support the same host-chosen packet filters for IN transfers */ #define DEFAULT_FILTER (USB_CDC_PACKET_TYPE_BROADCAST \ -- cgit v1.2.3 From 36e747972d8b4c09e6e3275e31a3acba46e2c4d2 Mon Sep 17 00:00:00 2001 From: Lech Perczak Date: Wed, 13 Apr 2022 03:44:15 +0200 Subject: rndis_host: enable the bogus MAC fixup for ZTE devices from cdc_ether MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Certain ZTE modems, namely: MF823. MF831, MF910, built-in modem from MF286R, expose both CDC-ECM and RNDIS network interfaces. They have a trait of ignoring the locally-administered MAC address configured on the interface both in CDC-ECM and RNDIS part, and this leads to dropping of incoming traffic by the host. However, the workaround was only present in CDC-ECM, and MF286R explicitly requires it in RNDIS mode. Re-use the workaround in rndis_host as well, to fix operation of MF286R module, some versions of which expose only the RNDIS interface. Do so by introducing new flag, RNDIS_DRIVER_DATA_DST_MAC_FIXUP, and testing for it in rndis_rx_fixup. This is required, as RNDIS uses frame batching, and all of the packets inside the batch need the fixup. This might introduce a performance penalty, because test is done for every returned Ethernet frame. Apply the workaround to both "flavors" of RNDIS interfaces, as older ZTE modems, like MF823 found in the wild, report the USB_CLASS_COMM class interfaces, while MF286R reports USB_CLASS_WIRELESS_CONTROLLER. Suggested-by: Bjørn Mork Cc: Kristian Evensen Cc: Oliver Neukum Signed-off-by: Lech Perczak Signed-off-by: Paolo Abeni --- include/linux/usb/rndis_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/rndis_host.h b/include/linux/usb/rndis_host.h index 809bccd08455..cc42db51bbba 100644 --- a/include/linux/usb/rndis_host.h +++ b/include/linux/usb/rndis_host.h @@ -197,6 +197,7 @@ struct rndis_keepalive_c { /* IN (optionally OUT) */ /* Flags for driver_info::data */ #define RNDIS_DRIVER_DATA_POLL_STATUS 1 /* poll status before control */ +#define RNDIS_DRIVER_DATA_DST_MAC_FIXUP 2 /* device ignores configured MAC address */ extern void rndis_status(struct usbnet *dev, struct urb *urb); extern int -- cgit v1.2.3 From 4dc84c06a343fcb95fd5a0acb537aefa4ebdd1b0 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Tue, 12 Apr 2022 10:01:19 +0800 Subject: net: ethtool: extend ringparam set/get APIs for tx_push Currently tx push is a standard driver feature which controls use of a fast path descriptor push. So this patch extends the ringparam APIs and data structures to support set/get tx push by ethtool -G/g. Signed-off-by: Jie Wang Signed-off-by: Guangbin Huang Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4af58459a1e7..99dc7bfbcd3c 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -71,11 +71,13 @@ enum { * struct kernel_ethtool_ringparam - RX/TX ring configuration * @rx_buf_len: Current length of buffers on the rx ring. * @tcp_data_split: Scatter packet headers and data to separate buffers + * @tx_push: The flag of tx push mode * @cqe_size: Size of TX/RX completion queue event */ struct kernel_ethtool_ringparam { u32 rx_buf_len; u8 tcp_data_split; + u8 tx_push; u32 cqe_size; }; @@ -83,10 +85,12 @@ struct kernel_ethtool_ringparam { * enum ethtool_supported_ring_param - indicator caps for setting ring params * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len * @ETHTOOL_RING_USE_CQE_SIZE: capture for setting cqe_size + * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), ETHTOOL_RING_USE_CQE_SIZE = BIT(1), + ETHTOOL_RING_USE_TX_PUSH = BIT(2), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) -- cgit v1.2.3 From f9a2fb73318eb4dbf8cd84866b8b0dd012d8b116 Mon Sep 17 00:00:00 2001 From: Arun Ajith S Date: Fri, 15 Apr 2022 08:34:02 +0000 Subject: net/ipv6: Introduce accept_unsolicited_na knob to implement router-side changes for RFC9131 Add a new neighbour cache entry in STALE state for routers on receiving an unsolicited (gratuitous) neighbour advertisement with target link-layer-address option specified. This is similar to the arp_accept configuration for IPv4. A new sysctl endpoint is created to turn on this behaviour: /proc/sys/net/ipv6/conf/interface/accept_unsolicited_na. Signed-off-by: Arun Ajith S Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 16870f86c74d..918bfea4ef5f 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -61,6 +61,7 @@ struct ipv6_devconf { __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; __s32 drop_unsolicited_na; + __s32 accept_unsolicited_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; -- cgit v1.2.3 From da40b613f89c43c58986e6f30560ad6573a4d569 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Apr 2022 17:10:41 -0700 Subject: tcp: add drop reason support to tcp_validate_incoming() Creates four new drop reasons for the following cases: 1) packet being rejected by RFC 7323 PAWS check 2) packet being rejected by SEQUENCE check 3) Invalid RST packet 4) Invalid SYN packet Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0ef11df1bc67..a903da1fa0ed 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -381,6 +381,12 @@ enum skb_drop_reason { * the ofo queue, corresponding to * LINUX_MIB_TCPOFOMERGE */ + SKB_DROP_REASON_TCP_RFC7323_PAWS, /* PAWS check, corresponding to + * LINUX_MIB_PAWSESTABREJECTED + */ + SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /* Not acceptable SEQ field */ + SKB_DROP_REASON_TCP_RESET, /* Invalid RST packet */ + SKB_DROP_REASON_TCP_INVALID_SYN, /* Incoming packet has unexpected SYN flag */ SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by * BPF_PROG_TYPE_CGROUP_SKB -- cgit v1.2.3 From 669da7a71890b2b2a31a7e9571c0fdf1123e26ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Apr 2022 17:10:43 -0700 Subject: tcp: add drop reasons to tcp_rcv_state_process() Add basic support for drop reasons in tcp_rcv_state_process() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a903da1fa0ed..6f1410b5ff13 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -387,6 +387,9 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /* Not acceptable SEQ field */ SKB_DROP_REASON_TCP_RESET, /* Invalid RST packet */ SKB_DROP_REASON_TCP_INVALID_SYN, /* Incoming packet has unexpected SYN flag */ + SKB_DROP_REASON_TCP_CLOSE, /* TCP socket in CLOSE state */ + SKB_DROP_REASON_TCP_FASTOPEN, /* dropped by FASTOPEN request socket */ + SKB_DROP_REASON_TCP_OLD_ACK, /* TCP ACK is old, but in window */ SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by * BPF_PROG_TYPE_CGROUP_SKB -- cgit v1.2.3 From 4b506af9c5b8de0da34097d50d9448dfb33d70c3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Apr 2022 17:10:44 -0700 Subject: tcp: add two drop reasons for tcp_ack() Add TCP_TOO_OLD_ACK and TCP_ACK_UNSENT_DATA drop reasons so that tcp_rcv_established() can report them. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6f1410b5ff13..9ff5557b1909 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -390,6 +390,8 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_CLOSE, /* TCP socket in CLOSE state */ SKB_DROP_REASON_TCP_FASTOPEN, /* dropped by FASTOPEN request socket */ SKB_DROP_REASON_TCP_OLD_ACK, /* TCP ACK is old, but in window */ + SKB_DROP_REASON_TCP_TOO_OLD_ACK, /* TCP ACK is too old */ + SKB_DROP_REASON_TCP_ACK_UNSENT_DATA, /* TCP ACK for data we haven't sent yet */ SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by * BPF_PROG_TYPE_CGROUP_SKB -- cgit v1.2.3 From e7c89ae4078eab24af71ba26b91642e819a4bd7f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Apr 2022 17:10:45 -0700 Subject: tcp: add drop reason support to tcp_prune_ofo_queue() Add one reason for packets dropped from OFO queue because of memory pressure. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9ff5557b1909..ad15ad208b56 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -392,6 +392,7 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_OLD_ACK, /* TCP ACK is old, but in window */ SKB_DROP_REASON_TCP_TOO_OLD_ACK, /* TCP ACK is too old */ SKB_DROP_REASON_TCP_ACK_UNSENT_DATA, /* TCP ACK for data we haven't sent yet */ + SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE, /* pruned from TCP OFO queue */ SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by * BPF_PROG_TYPE_CGROUP_SKB -- cgit v1.2.3 From 8fbf195798b56e1e87f62d01be636a6425c304c2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Apr 2022 17:10:48 -0700 Subject: tcp: add drop reason support to tcp_ofo_queue() packets in OFO queue might be redundant, and dropped. tcp_drop() is no longer needed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ad15ad208b56..84d78df60453 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -393,6 +393,7 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_TOO_OLD_ACK, /* TCP ACK is too old */ SKB_DROP_REASON_TCP_ACK_UNSENT_DATA, /* TCP ACK for data we haven't sent yet */ SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE, /* pruned from TCP OFO queue */ + SKB_DROP_REASON_TCP_OFO_DROP, /* data already in receive queue */ SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by * BPF_PROG_TYPE_CGROUP_SKB -- cgit v1.2.3 From 2f1e85b1aee459b7d0fd981839042c6a38ffaf0c Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 16 Apr 2022 00:40:45 +0800 Subject: net: sched: use queue_mapping to pick tx queue This patch fixes issue: * If we install tc filters with act_skbedit in clsact hook. It doesn't work, because netdev_core_pick_tx() overwrites queue_mapping. $ tc filter ... action skbedit queue_mapping 1 And this patch is useful: * We can use FQ + EDT to implement efficient policies. Tx queues are picked by xps, ndo_select_queue of netdev driver, or skb hash in netdev_core_pick_tx(). In fact, the netdev driver, and skb hash are _not_ under control. xps uses the CPUs map to select Tx queues, but we can't figure out which task_struct of pod/containter running on this cpu in most case. We can use clsact filters to classify one pod/container traffic to one Tx queue. Why ? In containter networking environment, there are two kinds of pod/ containter/net-namespace. One kind (e.g. P1, P2), the high throughput is key in these applications. But avoid running out of network resource, the outbound traffic of these pods is limited, using or sharing one dedicated Tx queues assigned HTB/TBF/FQ Qdisc. Other kind of pods (e.g. Pn), the low latency of data access is key. And the traffic is not limited. Pods use or share other dedicated Tx queues assigned FIFO Qdisc. This choice provides two benefits. First, contention on the HTB/FQ Qdisc lock is significantly reduced since fewer CPUs contend for the same queue. More importantly, Qdisc contention can be eliminated completely if each CPU has its own FIFO Qdisc for the second kind of pods. There must be a mechanism in place to support classifying traffic based on pods/container to different Tx queues. Note that clsact is outside of Qdisc while Qdisc can run a classifier to select a sub-queue under the lock. In general recording the decision in the skb seems a little heavy handed. This patch introduces a per-CPU variable, suggested by Eric. The xmit.skip_txqueue flag is firstly cleared in __dev_queue_xmit(). - Tx Qdisc may install that skbedit actions, then xmit.skip_txqueue flag is set in qdisc->enqueue() though tx queue has been selected in netdev_tx_queue_mapping() or netdev_core_pick_tx(). That flag is cleared firstly in __dev_queue_xmit(), is useful: - Avoid picking Tx queue with netdev_tx_queue_mapping() in next netdev in such case: eth0 macvlan - eth0.3 vlan - eth0 ixgbe-phy: For example, eth0, macvlan in pod, which root Qdisc install skbedit queue_mapping, send packets to eth0.3, vlan in host. In __dev_queue_xmit() of eth0.3, clear the flag, does not select tx queue according to skb->queue_mapping because there is no filters in clsact or tx Qdisc of this netdev. Same action taked in eth0, ixgbe in Host. - Avoid picking Tx queue for next packet. If we set xmit.skip_txqueue in tx Qdisc (qdisc->enqueue()), the proper way to clear it is clearing it in __dev_queue_xmit when processing next packets. For performance reasons, use the static key. If user does not config the NET_EGRESS, the patch will not be compiled. +----+ +----+ +----+ | P1 | | P2 | | Pn | +----+ +----+ +----+ | | | +-----------+-----------+ | | clsact/skbedit | MQ v +-----------+-----------+ | q0 | q1 | qn v v v HTB/FQ HTB/FQ ... FIFO Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Jonathan Lemon Cc: Eric Dumazet Cc: Alexander Lobakin Cc: Paolo Abeni Cc: Talal Ahmad Cc: Kevin Hao Cc: Ilias Apalodimas Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Antoine Tenart Cc: Wei Wang Cc: Arnd Bergmann Suggested-by: Eric Dumazet Signed-off-by: Tonghao Zhang Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 3 +++ include/linux/rtnetlink.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a602f29365b0..7dccbfd1bf56 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3061,6 +3061,9 @@ struct softnet_data { struct { u16 recursion; u8 more; +#ifdef CONFIG_NET_EGRESS + u8 skip_txqueue; +#endif } xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 7f970b16da3a..ae2c6a3cec5d 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -100,6 +100,7 @@ void net_dec_ingress_queue(void); #ifdef CONFIG_NET_EGRESS void net_inc_egress_queue(void); void net_dec_egress_queue(void); +void netdev_xmit_skip_txqueue(bool skip); #endif void rtnetlink_init(void); -- cgit v1.2.3 From eb38c2053b67977844404cbbdee341dbf3a02d36 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 14 Mar 2022 23:09:10 +0100 Subject: can: rx-offload: rename can_rx_offload_queue_sorted() -> can_rx_offload_queue_timestamp() This patch renames the function can_rx_offload_queue_sorted() to can_rx_offload_queue_timestamp(). This better describes what the function does, it adds a newly RX'ed skb to the sorted queue by its timestamp. Link: https://lore.kernel.org/all/20220417194327.2699059-1-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/rx-offload.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index c11477620403..c205c51d79c9 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -42,8 +42,8 @@ int can_rx_offload_add_manual(struct net_device *dev, int can_rx_offload_irq_offload_timestamp(struct can_rx_offload *offload, u64 reg); int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload); -int can_rx_offload_queue_sorted(struct can_rx_offload *offload, - struct sk_buff *skb, u32 timestamp); +int can_rx_offload_queue_timestamp(struct can_rx_offload *offload, + struct sk_buff *skb, u32 timestamp); unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, unsigned int idx, u32 timestamp, unsigned int *frame_len_ptr); -- cgit v1.2.3 From 055eb95533273bc334794dbc598400d10800528f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 14 Apr 2022 09:12:33 -0700 Subject: bpf: Move rcu lock management out of BPF_PROG_RUN routines Commit 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions") switched a bunch of BPF_PROG_RUN macros to inline routines. This changed the semantic a bit. Due to arguments expansion of macros, it used to be: rcu_read_lock(); array = rcu_dereference(cgrp->bpf.effective[atype]); ... Now, with with inline routines, we have: array_rcu = rcu_dereference(cgrp->bpf.effective[atype]); /* array_rcu can be kfree'd here */ rcu_read_lock(); array = rcu_dereference(array_rcu); I'm assuming in practice rcu subsystem isn't fast enough to trigger this but let's use rcu API properly. Also, rename to lower caps to not confuse with macros. Additionally, drop and expand BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY. See [1] for more context. [1] https://lore.kernel.org/bpf/CAKH8qBs60fOinFdxiiQikK_q0EcVxGvNTQoWvHLEUGbgcj1UYg@mail.gmail.com/T/#u v2 - keep rcu locks inside by passing cgroup_bpf Fixes: 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220414161233.170780-1-sdf@google.com --- include/linux/bpf.h | 115 ++++------------------------------------------------ 1 file changed, 7 insertions(+), 108 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bdb5298735ce..7bf441563ffc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1221,7 +1221,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, /* an array of programs to be executed under rcu_lock. * * Typical usage: - * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, bpf_prog_run); + * ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run); * * the structure returned by bpf_prog_array_alloc() should be populated * with program pointers and the last pointer must be NULL. @@ -1315,83 +1315,22 @@ static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); -static __always_inline int -BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, - const void *ctx, bpf_prog_run_fn run_prog, - int retval, u32 *ret_flags) -{ - const struct bpf_prog_array_item *item; - const struct bpf_prog *prog; - const struct bpf_prog_array *array; - struct bpf_run_ctx *old_run_ctx; - struct bpf_cg_run_ctx run_ctx; - u32 func_ret; - - run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); - item = &array->items[0]; - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - while ((prog = READ_ONCE(item->prog))) { - run_ctx.prog_item = item; - func_ret = run_prog(prog, ctx); - if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval)) - run_ctx.retval = -EPERM; - *(ret_flags) |= (func_ret >> 1); - item++; - } - bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); - return run_ctx.retval; -} - -static __always_inline int -BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, - const void *ctx, bpf_prog_run_fn run_prog, - int retval) -{ - const struct bpf_prog_array_item *item; - const struct bpf_prog *prog; - const struct bpf_prog_array *array; - struct bpf_run_ctx *old_run_ctx; - struct bpf_cg_run_ctx run_ctx; - - run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); - item = &array->items[0]; - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - while ((prog = READ_ONCE(item->prog))) { - run_ctx.prog_item = item; - if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval)) - run_ctx.retval = -EPERM; - item++; - } - bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); - return run_ctx.retval; -} - static __always_inline u32 -BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, +bpf_prog_run_array(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; - const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); + if (unlikely(!array)) - goto out; + return ret; + + migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { @@ -1400,50 +1339,10 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, item++; } bpf_reset_run_ctx(old_run_ctx); -out: - rcu_read_unlock(); migrate_enable(); return ret; } -/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs - * so BPF programs can request cwr for TCP packets. - * - * Current cgroup skb programs can only return 0 or 1 (0 to drop the - * packet. This macro changes the behavior so the low order bit - * indicates whether the packet should be dropped (0) or not (1) - * and the next bit is a congestion notification bit. This could be - * used by TCP to call tcp_enter_cwr() - * - * Hence, new allowed return values of CGROUP EGRESS BPF programs are: - * 0: drop packet - * 1: keep packet - * 2: drop packet and cn - * 3: keep packet and cn - * - * This macro then converts it to one of the NET_XMIT or an error - * code that is then interpreted as drop packet (and no cn): - * 0: NET_XMIT_SUCCESS skb should be transmitted - * 1: NET_XMIT_DROP skb should be dropped and cn - * 2: NET_XMIT_CN skb should be transmitted and cn - * 3: -err skb should be dropped - */ -#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ - ({ \ - u32 _flags = 0; \ - bool _cn; \ - u32 _ret; \ - _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \ - _cn = _flags & BPF_RET_SET_CN; \ - if (_ret && !IS_ERR_VALUE((long)_ret)) \ - _ret = -EFAULT; \ - if (!_ret) \ - _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ - else \ - _ret = (_cn ? NET_XMIT_DROP : _ret); \ - _ret; \ - }) - #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; -- cgit v1.2.3 From dcf456c9a095a6e71f53d6f6f004133ee851ee70 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Mon, 18 Apr 2022 15:51:58 +0000 Subject: bpf: Fix usage of trace RCU in local storage. bpf_{sk,task,inode}_storage_free() do not need to use call_rcu_tasks_trace as no BPF program should be accessing the owner as it's being destroyed. The only other reader at this point is bpf_local_storage_map_free() which uses normal RCU. The only path that needs trace RCU are: * bpf_local_storage_{delete,update} helpers * map_{delete,update}_elem() syscalls Fixes: 0fe4b381a59e ("bpf: Allow bpf_local_storage to be used by sleepable programs") Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220418155158.2865678-1-kpsingh@kernel.org --- include/linux/bpf_local_storage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 493e63258497..7ea18d4da84b 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -143,9 +143,9 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_omem); + bool uncharge_omem, bool use_trace_rcu); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem); +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem); -- cgit v1.2.3 From 92ece28072f18f30099770c5d4b8e300ea6820fa Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Sat, 16 Apr 2022 18:58:00 +0800 Subject: net: Change skb_ensure_writable()'s write_len param to unsigned int type Both pskb_may_pull() and skb_clone_writable()'s length parameters are of type unsigned int already. Therefore, change this function's write_len param to unsigned int type. Signed-off-by: Liu Jian Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220416105801.88708-3-liujian56@huawei.com --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2394441fa3dd..b37c829d08e1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3885,7 +3885,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); -int skb_ensure_writable(struct sk_buff *skb, int write_len); +int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); -- cgit v1.2.3 From 89e9c7280075f6733b22dd0740daeddeb1256ebf Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 20 Apr 2022 10:58:50 +0900 Subject: ipv6: Remove __ipv6_only_sock(). Since commit 9fe516ba3fb2 ("inet: move ipv6only in sock_common"), ipv6_only_sock() and __ipv6_only_sock() are the same macro. Let's remove the one. Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 918bfea4ef5f..ec5ca392eaa3 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -340,8 +340,7 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk) return (struct raw6_sock *)sk; } -#define __ipv6_only_sock(sk) (sk->sk_ipv6only) -#define ipv6_only_sock(sk) (__ipv6_only_sock(sk)) +#define ipv6_only_sock(sk) (sk->sk_ipv6only) #define ipv6_sk_rxinfo(sk) ((sk)->sk_family == PF_INET6 && \ inet6_sk(sk)->rxopt.bits.rxinfo) @@ -358,7 +357,6 @@ static inline int inet_v6_ipv6only(const struct sock *sk) return ipv6_only_sock(sk); } #else -#define __ipv6_only_sock(sk) 0 #define ipv6_only_sock(sk) 0 #define ipv6_sk_rxinfo(sk) 0 -- cgit v1.2.3 From 5e7260712b9a76de19c32f15d667f8f93e573978 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 21 Apr 2022 14:47:26 +0200 Subject: qed: Remove IP services API. qed_nvmetcp_ip_services.c and its corresponding header file were introduced in commit 806ee7f81a2b ("qed: Add IP services APIs support") but there's still no users for any of the functions they declare. Since these files are effectively unused, let's just drop them. Found by code inspection. Compile-tested only. Signed-off-by: Guillaume Nault Link: https://lore.kernel.org/r/351ac8c847980e22850eb390553f8cc0e1ccd0ce.1650545051.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/qed/qed_nvmetcp_ip_services_if.h | 29 -------------------------- 1 file changed, 29 deletions(-) delete mode 100644 include/linux/qed/qed_nvmetcp_ip_services_if.h (limited to 'include/linux') diff --git a/include/linux/qed/qed_nvmetcp_ip_services_if.h b/include/linux/qed/qed_nvmetcp_ip_services_if.h deleted file mode 100644 index 3604aee53796..000000000000 --- a/include/linux/qed/qed_nvmetcp_ip_services_if.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */ -/* - * Copyright 2021 Marvell. All rights reserved. - */ - -#ifndef _QED_IP_SERVICES_IF_H -#define _QED_IP_SERVICES_IF_H - -#include -#include -#include -#include - -int qed_route_ipv4(struct sockaddr_storage *local_addr, - struct sockaddr_storage *remote_addr, - struct sockaddr *hardware_address, - struct net_device **ndev); -int qed_route_ipv6(struct sockaddr_storage *local_addr, - struct sockaddr_storage *remote_addr, - struct sockaddr *hardware_address, - struct net_device **ndev); -void qed_vlan_get_ndev(struct net_device **ndev, u16 *vlan_id); -struct pci_dev *qed_validate_ndev(struct net_device *ndev); -void qed_return_tcp_port(struct socket *sock); -int qed_fetch_tcp_port(struct sockaddr_storage local_ip_addr, - struct socket **sock, u16 *port); -__be16 qed_get_in_port(struct sockaddr_storage *sa); - -#endif /* _QED_IP_SERVICES_IF_H */ -- cgit v1.2.3 From 5b0e58542acb3672f42e08cb1cd57784f5e08d6b Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 7 Apr 2022 12:08:54 +0200 Subject: net: ieee802154: Enhance/fix the names of the MLME return codes Let's keep these definitions as close to the specification as possible while they are not yet in use. The names get slightly longer, but we gain the minor cost of being able to search the spec more easily. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220407100903.1695973-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/linux/ieee802154.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h index 95c831162212..01d945c8b2e1 100644 --- a/include/linux/ieee802154.h +++ b/include/linux/ieee802154.h @@ -136,16 +136,16 @@ enum { IEEE802154_SUCCESS = 0x0, /* The beacon was lost following a synchronization request. */ - IEEE802154_BEACON_LOSS = 0xe0, + IEEE802154_BEACON_LOST = 0xe0, /* * A transmission could not take place due to activity on the * channel, i.e., the CSMA-CA mechanism has failed. */ - IEEE802154_CHNL_ACCESS_FAIL = 0xe1, + IEEE802154_CHANNEL_ACCESS_FAILURE = 0xe1, /* The GTS request has been denied by the PAN coordinator. */ - IEEE802154_DENINED = 0xe2, + IEEE802154_DENIED = 0xe2, /* The attempt to disable the transceiver has failed. */ - IEEE802154_DISABLE_TRX_FAIL = 0xe3, + IEEE802154_DISABLE_TRX_FAILURE = 0xe3, /* * The received frame induces a failed security check according to * the security suite. @@ -185,9 +185,9 @@ enum { * A PAN identifier conflict has been detected and communicated to the * PAN coordinator. */ - IEEE802154_PANID_CONFLICT = 0xee, + IEEE802154_PAN_ID_CONFLICT = 0xee, /* A coordinator realignment command has been received. */ - IEEE802154_REALIGMENT = 0xef, + IEEE802154_REALIGNMENT = 0xef, /* The transaction has expired and its information discarded. */ IEEE802154_TRANSACTION_EXPIRED = 0xf0, /* There is no capacity to store the transaction. */ @@ -203,7 +203,7 @@ enum { * A SET/GET request was issued with the identifier of a PIB attribute * that is not supported. */ - IEEE802154_UNSUPPORTED_ATTR = 0xf4, + IEEE802154_UNSUPPORTED_ATTRIBUTE = 0xf4, /* * A request to perform a scan operation failed because the MLME was * in the process of performing a previously initiated scan operation. -- cgit v1.2.3 From f06cfc233ac67e26ed137f4e098c047939cf1530 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 7 Apr 2022 12:08:55 +0200 Subject: net: ieee802154: Fill the list of MLME return codes There are more codes than already listed, let's be a bit more exhaustive. This will allow to drop device drivers local definitions of these codes. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220407100903.1695973-3-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/linux/ieee802154.h | 67 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h index 01d945c8b2e1..f1f9412b6ac6 100644 --- a/include/linux/ieee802154.h +++ b/include/linux/ieee802154.h @@ -134,7 +134,35 @@ enum { * a successful transmission. */ IEEE802154_SUCCESS = 0x0, - + /* The requested operation failed. */ + IEEE802154_MAC_ERROR = 0x1, + /* The requested operation has been cancelled. */ + IEEE802154_CANCELLED = 0x2, + /* + * Device is ready to poll the coordinator for data in a non beacon + * enabled PAN. + */ + IEEE802154_READY_FOR_POLL = 0x3, + /* Wrong frame counter. */ + IEEE802154_COUNTER_ERROR = 0xdb, + /* + * The frame does not conforms to the incoming key usage policy checking + * procedure. + */ + IEEE802154_IMPROPER_KEY_TYPE = 0xdc, + /* + * The frame does not conforms to the incoming security level usage + * policy checking procedure. + */ + IEEE802154_IMPROPER_SECURITY_LEVEL = 0xdd, + /* Secured frame received with an empty Frame Version field. */ + IEEE802154_UNSUPPORTED_LEGACY = 0xde, + /* + * A secured frame is received or must be sent but security is not + * enabled in the device. Or, the Auxiliary Security Header has security + * level of zero in it. + */ + IEEE802154_UNSUPPORTED_SECURITY = 0xdf, /* The beacon was lost following a synchronization request. */ IEEE802154_BEACON_LOST = 0xe0, /* @@ -204,11 +232,48 @@ enum { * that is not supported. */ IEEE802154_UNSUPPORTED_ATTRIBUTE = 0xf4, + /* Missing source or destination address or address mode. */ + IEEE802154_INVALID_ADDRESS = 0xf5, + /* + * MLME asked to turn the receiver on, but the on time duration is too + * big compared to the macBeaconOrder. + */ + IEEE802154_ON_TIME_TOO_LONG = 0xf6, + /* + * MLME asaked to turn the receiver on, but the request was delayed for + * too long before getting processed. + */ + IEEE802154_PAST_TIME = 0xf7, + /* + * The StartTime parameter is nonzero, and the MLME is not currently + * tracking the beacon of the coordinator through which it is + * associated. + */ + IEEE802154_TRACKING_OFF = 0xf8, + /* + * The index inside the hierarchical values in PIBAttribute is out of + * range. + */ + IEEE802154_INVALID_INDEX = 0xf9, + /* + * The number of PAN descriptors discovered during a scan has been + * reached. + */ + IEEE802154_LIMIT_REACHED = 0xfa, + /* + * The PIBAttribute parameter specifies an attribute that is a read-only + * attribute. + */ + IEEE802154_READ_ONLY = 0xfb, /* * A request to perform a scan operation failed because the MLME was * in the process of performing a previously initiated scan operation. */ IEEE802154_SCAN_IN_PROGRESS = 0xfc, + /* The outgoing superframe overlaps the incoming superframe. */ + IEEE802154_SUPERFRAME_OVERLAP = 0xfd, + /* Any other error situation. */ + IEEE802154_SYSTEM_ERROR = 0xff, }; /* frame control handling */ -- cgit v1.2.3 From d9d31cf88702ae071bec033e5c8714048aa71285 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 25 Apr 2022 15:04:48 -0700 Subject: bpf: Use bpf_prog_run_array_cg_flags everywhere Rename bpf_prog_run_array_cg_flags to bpf_prog_run_array_cg and use it everywhere. check_return_code already enforces sane return ranges for all cgroup types. (only egress and bind hooks have uncanonical return ranges, the rest is using [0, 1]) No functional changes. v2: - 'func_ret & 1' under explicit test (Andrii & Martin) Suggested-by: Alexei Starovoitov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220425220448.3669032-1-sdf@google.com --- include/linux/bpf-cgroup.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 88a51b242adc..669d96d074ad 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -225,24 +225,20 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \ ({ \ - u32 __unused_flags; \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ - NULL, \ - &__unused_flags); \ + NULL, NULL); \ __ret; \ }) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) \ ({ \ - u32 __unused_flags; \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ - t_ctx, \ - &__unused_flags); \ + t_ctx, NULL); \ release_sock(sk); \ } \ __ret; \ -- cgit v1.2.3 From 61df10c7799e27807ad5e459eec9d77cddf8bf45 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:49 +0530 Subject: bpf: Allow storing unreferenced kptr in map This commit introduces a new pointer type 'kptr' which can be embedded in a map value to hold a PTR_TO_BTF_ID stored by a BPF program during its invocation. When storing such a kptr, BPF program's PTR_TO_BTF_ID register must have the same type as in the map value's BTF, and loading a kptr marks the destination register as PTR_TO_BTF_ID with the correct kernel BTF and BTF ID. Such kptr are unreferenced, i.e. by the time another invocation of the BPF program loads this pointer, the object which the pointer points to may not longer exist. Since PTR_TO_BTF_ID loads (using BPF_LDX) are patched to PROBE_MEM loads by the verifier, it would safe to allow user to still access such invalid pointer, but passing such pointers into BPF helpers and kfuncs should not be permitted. A future patch in this series will close this gap. The flexibility offered by allowing programs to dereference such invalid pointers while being safe at runtime frees the verifier from doing complex lifetime tracking. As long as the user may ensure that the object remains valid, it can ensure data read by it from the kernel object is valid. The user indicates that a certain pointer must be treated as kptr capable of accepting stores of PTR_TO_BTF_ID of a certain type, by using a BTF type tag 'kptr' on the pointed to type of the pointer. Then, this information is recorded in the object BTF which will be passed into the kernel by way of map's BTF information. The name and kind from the map value BTF is used to look up the in-kernel type, and the actual BTF and BTF ID is recorded in the map struct in a new kptr_off_tab member. For now, only storing pointers to structs is permitted. An example of this specification is shown below: #define __kptr __attribute__((btf_type_tag("kptr"))) struct map_value { ... struct task_struct __kptr *task; ... }; Then, in a BPF program, user may store PTR_TO_BTF_ID with the type task_struct into the map, and then load it later. Note that the destination register is marked PTR_TO_BTF_ID_OR_NULL, as the verifier cannot know whether the value is NULL or not statically, it must treat all potential loads at that map value offset as loading a possibly NULL pointer. Only BPF_LDX, BPF_STX, and BPF_ST (with insn->imm = 0 to denote NULL) are allowed instructions that can access such a pointer. On BPF_LDX, the destination register is updated to be a PTR_TO_BTF_ID, and on BPF_STX, it is checked whether the source register type is a PTR_TO_BTF_ID with same BTF type as specified in the map BTF. The access size must always be BPF_DW. For the map in map support, the kptr_off_tab for outer map is copied from the inner map's kptr_off_tab. It was chosen to do a deep copy instead of introducing a refcount to kptr_off_tab, because the copy only needs to be done when paramterizing using inner_map_fd in the map in map case, hence would be unnecessary for all other users. It is not permitted to use MAP_FREEZE command and mmap for BPF map having kptrs, similar to the bpf_timer case. A kptr also requires that BPF program has both read and write access to the map (hence both BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG are disallowed). Note that check_map_access must be called from both check_helper_mem_access and for the BPF instructions, hence the kptr check must distinguish between ACCESS_DIRECT and ACCESS_HELPER, and reject ACCESS_HELPER cases. We rename stack_access_src to bpf_access_src and reuse it for this purpose. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-2-memxor@gmail.com --- include/linux/bpf.h | 31 ++++++++++++++++++++++++++++++- include/linux/btf.h | 2 ++ 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7bf441563ffc..ce12124048c0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -155,6 +155,24 @@ struct bpf_map_ops { const struct bpf_iter_seq_info *iter_seq_info; }; +enum { + /* Support at most 8 pointers in a BPF map value */ + BPF_MAP_VALUE_OFF_MAX = 8, +}; + +struct bpf_map_value_off_desc { + u32 offset; + struct { + struct btf *btf; + u32 btf_id; + } kptr; +}; + +struct bpf_map_value_off { + u32 nr_off; + struct bpf_map_value_off_desc off[]; +}; + struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -171,6 +189,7 @@ struct bpf_map { u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ + struct bpf_map_value_off *kptr_off_tab; int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; @@ -184,7 +203,7 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ - /* 14 bytes hole */ + /* 6 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. @@ -217,6 +236,11 @@ static inline bool map_value_has_timer(const struct bpf_map *map) return map->timer_off >= 0; } +static inline bool map_value_has_kptrs(const struct bpf_map *map) +{ + return !IS_ERR_OR_NULL(map->kptr_off_tab); +} + static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) @@ -1396,6 +1420,11 @@ void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); +struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset); +void bpf_map_free_kptr_off_tab(struct bpf_map *map); +struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map); +bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b); + struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); diff --git a/include/linux/btf.h b/include/linux/btf.h index 36bc09b8e890..19c297f9a52f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -123,6 +123,8 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); +struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, + const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit v1.2.3 From 8f14852e89113d738c99c375b4c8b8b7e1073df1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:50 +0530 Subject: bpf: Tag argument to be released in bpf_func_proto Add a new type flag for bpf_arg_type that when set tells verifier that for a release function, that argument's register will be the one for which meta.ref_obj_id will be set, and which will then be released using release_reference. To capture the regno, introduce a new field release_regno in bpf_call_arg_meta. This would be required in the next patch, where we may either pass NULL or a refcounted pointer as an argument to the release function bpf_kptr_xchg. Just releasing only when meta.ref_obj_id is set is not enough, as there is a case where the type of argument needed matches, but the ref_obj_id is set to 0. Hence, we must enforce that whenever meta.ref_obj_id is zero, the register that is to be released can only be NULL for a release function. Since we now indicate whether an argument is to be released in bpf_func_proto itself, is_release_function helper has lost its utitlity, hence refactor code to work without it, and just rely on meta.release_regno to know when to release state for a ref_obj_id. Still, the restriction of one release argument and only one ref_obj_id passed to BPF helper or kfunc remains. This may be lifted in the future. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-3-memxor@gmail.com --- include/linux/bpf.h | 5 ++++- include/linux/bpf_verifier.h | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ce12124048c0..492edd2c5713 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -366,7 +366,10 @@ enum bpf_type_flag { */ MEM_PERCPU = BIT(4 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_PERCPU, + /* Indicates that the argument will be released. */ + OBJ_RELEASE = BIT(5 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = OBJ_RELEASE, }; /* Max number of base types. */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3a9d2d7cc6b7..1f1e7f2ea967 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -523,8 +523,7 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type, - bool is_release_func); + enum bpf_arg_type arg_type); int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, -- cgit v1.2.3 From c0a5a21c25f37c9fd7b36072f9968cdff1e4aa13 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:51 +0530 Subject: bpf: Allow storing referenced kptr in map Extending the code in previous commits, introduce referenced kptr support, which needs to be tagged using 'kptr_ref' tag instead. Unlike unreferenced kptr, referenced kptr have a lot more restrictions. In addition to the type matching, only a newly introduced bpf_kptr_xchg helper is allowed to modify the map value at that offset. This transfers the referenced pointer being stored into the map, releasing the references state for the program, and returning the old value and creating new reference state for the returned pointer. Similar to unreferenced pointer case, return value for this case will also be PTR_TO_BTF_ID_OR_NULL. The reference for the returned pointer must either be eventually released by calling the corresponding release function, otherwise it must be transferred into another map. It is also allowed to call bpf_kptr_xchg with a NULL pointer, to clear the value, and obtain the old value if any. BPF_LDX, BPF_STX, and BPF_ST cannot access referenced kptr. A future commit will permit using BPF_LDX for such pointers, but attempt at making it safe, since the lifetime of object won't be guaranteed. There are valid reasons to enforce the restriction of permitting only bpf_kptr_xchg to operate on referenced kptr. The pointer value must be consistent in face of concurrent modification, and any prior values contained in the map must also be released before a new one is moved into the map. To ensure proper transfer of this ownership, bpf_kptr_xchg returns the old value, which the verifier would require the user to either free or move into another map, and releases the reference held for the pointer being moved in. In the future, direct BPF_XCHG instruction may also be permitted to work like bpf_kptr_xchg helper. Note that process_kptr_func doesn't have to call check_helper_mem_access, since we already disallow rdonly/wronly flags for map, which is what check_map_access_type checks, and we already ensure the PTR_TO_MAP_VALUE refers to kptr by obtaining its off_desc, so check_map_access is also not required. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-4-memxor@gmail.com --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 492edd2c5713..24310837bafc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -160,8 +160,14 @@ enum { BPF_MAP_VALUE_OFF_MAX = 8, }; +enum bpf_kptr_type { + BPF_KPTR_UNREF, + BPF_KPTR_REF, +}; + struct bpf_map_value_off_desc { u32 offset; + enum bpf_kptr_type type; struct { struct btf *btf; u32 btf_id; @@ -418,6 +424,7 @@ enum bpf_arg_type { ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ + ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ @@ -427,6 +434,7 @@ enum bpf_arg_type { ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, + ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. -- cgit v1.2.3 From 6efe152d4061a83a2c5db6a0e5b58f9345c9742f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:52 +0530 Subject: bpf: Prevent escaping of kptr loaded from maps While we can guarantee that even for unreferenced kptr, the object pointer points to being freed etc. can be handled by the verifier's exception handling (normal load patching to PROBE_MEM loads), we still cannot allow the user to pass these pointers to BPF helpers and kfunc, because the same exception handling won't be done for accesses inside the kernel. The same is true if a referenced pointer is loaded using normal load instruction. Since the reference is not guaranteed to be held while the pointer is used, it must be marked as untrusted. Hence introduce a new type flag, PTR_UNTRUSTED, which is used to mark all registers loading unreferenced and referenced kptr from BPF maps, and ensure they can never escape the BPF program and into the kernel by way of calling stable/unstable helpers. In check_ptr_to_btf_access, the !type_may_be_null check to reject type flags is still correct, as apart from PTR_MAYBE_NULL, only MEM_USER, MEM_PERCPU, and PTR_UNTRUSTED may be set for PTR_TO_BTF_ID. The first two are checked inside the function and rejected using a proper error message, but we still want to allow dereference of untrusted case. Also, we make sure to inherit PTR_UNTRUSTED when chain of pointers are walked, so that this flag is never dropped once it has been set on a PTR_TO_BTF_ID (i.e. trusted to untrusted transition can only be in one direction). In convert_ctx_accesses, extend the switch case to consider untrusted PTR_TO_BTF_ID in addition to normal PTR_TO_BTF_ID for PROBE_MEM conversion for BPF_LDX. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-5-memxor@gmail.com --- include/linux/bpf.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 24310837bafc..e13a5cbd4ebb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -375,7 +375,15 @@ enum bpf_type_flag { /* Indicates that the argument will be released. */ OBJ_RELEASE = BIT(5 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = OBJ_RELEASE, + /* PTR is not trusted. This is only used with PTR_TO_BTF_ID, to mark + * unreferenced and referenced kptr loaded from map value using a load + * instruction, so that they can only be dereferenced but not escape the + * BPF program into the kernel (i.e. cannot be passed as arguments to + * kfunc or bpf helpers). + */ + PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = PTR_UNTRUSTED, }; /* Max number of base types. */ -- cgit v1.2.3 From 4d7d7f69f4b104b2ddeec6a1e7fcfd2d044ed8c4 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:53 +0530 Subject: bpf: Adapt copy_map_value for multiple offset case Since now there might be at most 10 offsets that need handling in copy_map_value, the manual shuffling and special case is no longer going to work. Hence, let's generalise the copy_map_value function by using a sorted array of offsets to skip regions that must be avoided while copying into and out of a map value. When the map is created, we populate the offset array in struct map, Then, copy_map_value uses this sorted offset array is used to memcpy while skipping timer, spin lock, and kptr. The array is allocated as in most cases none of these special fields would be present in map value, hence we can save on space for the common case by not embedding the entire object inside bpf_map struct. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-6-memxor@gmail.com --- include/linux/bpf.h | 56 ++++++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e13a5cbd4ebb..4e12b27a5de6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -158,6 +158,9 @@ struct bpf_map_ops { enum { /* Support at most 8 pointers in a BPF map value */ BPF_MAP_VALUE_OFF_MAX = 8, + BPF_MAP_OFF_ARR_MAX = BPF_MAP_VALUE_OFF_MAX + + 1 + /* for bpf_spin_lock */ + 1, /* for bpf_timer */ }; enum bpf_kptr_type { @@ -179,6 +182,12 @@ struct bpf_map_value_off { struct bpf_map_value_off_desc off[]; }; +struct bpf_map_off_arr { + u32 cnt; + u32 field_off[BPF_MAP_OFF_ARR_MAX]; + u8 field_sz[BPF_MAP_OFF_ARR_MAX]; +}; + struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -207,10 +216,7 @@ struct bpf_map { struct mem_cgroup *memcg; #endif char name[BPF_OBJ_NAME_LEN]; - bool bypass_spec_v1; - bool frozen; /* write-once; write-protected by freeze_mutex */ - /* 6 bytes hole */ - + struct bpf_map_off_arr *off_arr; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ @@ -230,6 +236,8 @@ struct bpf_map { bool jited; bool xdp_has_frags; } owner; + bool bypass_spec_v1; + bool frozen; /* write-once; write-protected by freeze_mutex */ }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -253,37 +261,33 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); + if (unlikely(map_value_has_kptrs(map))) { + struct bpf_map_value_off *tab = map->kptr_off_tab; + int i; + + for (i = 0; i < tab->nr_off; i++) + *(u64 *)(dst + tab->off[i].offset) = 0; + } } /* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0; + u32 curr_off = 0; + int i; - if (unlikely(map_value_has_spin_lock(map))) { - s_off = map->spin_lock_off; - s_sz = sizeof(struct bpf_spin_lock); - } - if (unlikely(map_value_has_timer(map))) { - t_off = map->timer_off; - t_sz = sizeof(struct bpf_timer); + if (likely(!map->off_arr)) { + memcpy(dst, src, map->value_size); + return; } - if (unlikely(s_sz || t_sz)) { - if (s_off < t_off || !s_sz) { - swap(s_off, t_off); - swap(s_sz, t_sz); - } - memcpy(dst, src, t_off); - memcpy(dst + t_off + t_sz, - src + t_off + t_sz, - s_off - t_off - t_sz); - memcpy(dst + s_off + s_sz, - src + s_off + s_sz, - map->value_size - s_off - s_sz); - } else { - memcpy(dst, src, map->value_size); + for (i = 0; i < map->off_arr->cnt; i++) { + u32 next_off = map->off_arr->field_off[i]; + + memcpy(dst + curr_off, src + curr_off, next_off - curr_off); + curr_off += map->off_arr->field_sz[i]; } + memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); -- cgit v1.2.3 From 5ce937d613a423ca3102f53d9f3daf4210c1b6e2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:54 +0530 Subject: bpf: Populate pairs of btf_id and destructor kfunc in btf To support storing referenced PTR_TO_BTF_ID in maps, we require associating a specific BTF ID with a 'destructor' kfunc. This is because we need to release a live referenced pointer at a certain offset in map value from the map destruction path, otherwise we end up leaking resources. Hence, introduce support for passing an array of btf_id, kfunc_btf_id pairs that denote a BTF ID and its associated release function. Then, add an accessor 'btf_find_dtor_kfunc' which can be used to look up the destructor kfunc of a certain BTF ID. If found, we can use it to free the object from the map free path. The registration of these pairs also serve as a whitelist of structures which are allowed as referenced PTR_TO_BTF_ID in a BPF map, because without finding the destructor kfunc, we will bail and return an error. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-7-memxor@gmail.com --- include/linux/btf.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 19c297f9a52f..fea424681d66 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -40,6 +40,11 @@ struct btf_kfunc_id_set { }; }; +struct btf_id_dtor_kfunc { + u32 btf_id; + u32 kfunc_btf_id; +}; + extern const struct file_operations btf_fops; void btf_get(struct btf *btf); @@ -346,6 +351,9 @@ bool btf_kfunc_id_set_contains(const struct btf *btf, enum btf_kfunc_type type, u32 kfunc_btf_id); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); +s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); +int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, + struct module *owner); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -369,6 +377,15 @@ static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, { return 0; } +static inline s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) +{ + return -ENOENT; +} +static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, + u32 add_cnt, struct module *owner) +{ + return 0; +} #endif #endif -- cgit v1.2.3 From 14a324f6a67ef6a53e04362a70160a47eb8afffa Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:55 +0530 Subject: bpf: Wire up freeing of referenced kptr A destructor kfunc can be defined as void func(type *), where type may be void or any other pointer type as per convenience. In this patch, we ensure that the type is sane and capture the function pointer into off_desc of ptr_off_tab for the specific pointer offset, with the invariant that the dtor pointer is always set when 'kptr_ref' tag is applied to the pointer's pointee type, which is indicated by the flag BPF_MAP_VALUE_OFF_F_REF. Note that only BTF IDs whose destructor kfunc is registered, thus become the allowed BTF IDs for embedding as referenced kptr. Hence it serves the purpose of finding dtor kfunc BTF ID, as well acting as a check against the whitelist of allowed BTF IDs for this purpose. Finally, wire up the actual freeing of the referenced pointer if any at all available offsets, so that no references are leaked after the BPF map goes away and the BPF program previously moved the ownership a referenced pointer into it. The behavior is similar to BPF timers, where bpf_map_{update,delete}_elem will free any existing referenced kptr. The same case is with LRU map's bpf_lru_push_free/htab_lru_push_free functions, which are extended to reset unreferenced and free referenced kptr. Note that unlike BPF timers, kptr is not reset or freed when map uref drops to zero. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-8-memxor@gmail.com --- include/linux/bpf.h | 4 ++++ include/linux/btf.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e12b27a5de6..6141564c76c8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,6 +23,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -173,6 +174,8 @@ struct bpf_map_value_off_desc { enum bpf_kptr_type type; struct { struct btf *btf; + struct module *module; + btf_dtor_kfunc_t dtor; u32 btf_id; } kptr; }; @@ -1447,6 +1450,7 @@ struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u3 void bpf_map_free_kptr_off_tab(struct bpf_map *map); struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map); bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b); +void bpf_map_free_kptrs(struct bpf_map *map, void *map_value); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index fea424681d66..f70625dd5bb4 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -45,6 +45,8 @@ struct btf_id_dtor_kfunc { u32 kfunc_btf_id; }; +typedef void (*btf_dtor_kfunc_t)(void *); + extern const struct file_operations btf_fops; void btf_get(struct btf *btf); -- cgit v1.2.3 From a1ef195996526da45bbc9710849254023df75aea Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:56 +0530 Subject: bpf: Teach verifier about kptr_get kfunc helpers We introduce a new style of kfunc helpers, namely *_kptr_get, where they take pointer to the map value which points to a referenced kernel pointer contained in the map. Since this is referenced, only bpf_kptr_xchg from BPF side and xchg from kernel side is allowed to change the current value, and each pointer that resides in that location would be referenced, and RCU protected (this must be kept in mind while adding kernel types embeddable as reference kptr in BPF maps). This means that if do the load of the pointer value in an RCU read section, and find a live pointer, then as long as we hold RCU read lock, it won't be freed by a parallel xchg + release operation. This allows us to implement a safe refcount increment scheme. Hence, enforce that first argument of all such kfunc is a proper PTR_TO_MAP_VALUE pointing at the right offset to referenced pointer. For the rest of the arguments, they are subjected to typical kfunc argument checks, hence allowing some flexibility in passing more intent into how the reference should be taken. For instance, in case of struct nf_conn, it is not freed until RCU grace period ends, but can still be reused for another tuple once refcount has dropped to zero. Hence, a bpf_ct_kptr_get helper not only needs to call refcount_inc_not_zero, but also do a tuple match after incrementing the reference, and when it fails to match it, put the reference again and return NULL. This can be implemented easily if we allow passing additional parameters to the bpf_ct_kptr_get kfunc, like a struct bpf_sock_tuple * and a tuple__sz pair. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-9-memxor@gmail.com --- include/linux/btf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f70625dd5bb4..2611cea2c2b6 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -17,6 +17,7 @@ enum btf_kfunc_type { BTF_KFUNC_TYPE_ACQUIRE, BTF_KFUNC_TYPE_RELEASE, BTF_KFUNC_TYPE_RET_NULL, + BTF_KFUNC_TYPE_KPTR_ACQUIRE, BTF_KFUNC_TYPE_MAX, }; @@ -35,6 +36,7 @@ struct btf_kfunc_id_set { struct btf_id_set *acquire_set; struct btf_id_set *release_set; struct btf_id_set *ret_null_set; + struct btf_id_set *kptr_acquire_set; }; struct btf_id_set *sets[BTF_KFUNC_TYPE_MAX]; }; -- cgit v1.2.3 From 2ab3b3808eb17f729edfd69e061667ca0a427195 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:57 +0530 Subject: bpf: Make BTF type match stricter for release arguments The current of behavior of btf_struct_ids_match for release arguments is that when type match fails, it retries with first member type again (recursively). Since the offset is already 0, this is akin to just casting the pointer in normal C, since if type matches it was just embedded inside parent sturct as an object. However, we want to reject cases for release function type matching, be it kfunc or BPF helpers. An example is the following: struct foo { struct bar b; }; struct foo *v = acq_foo(); rel_bar(&v->b); // btf_struct_ids_match fails btf_types_are_same, then // retries with first member type and succeeds, while // it should fail. Hence, don't walk the struct and only rely on btf_types_are_same for strict mode. All users of strict mode must be dealing with zero offset anyway, since otherwise they would want the struct to be walked. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-10-memxor@gmail.com --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6141564c76c8..0af5793ba417 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1748,7 +1748,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, u32 *next_btf_id, enum bpf_type_flag *flag); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, - const struct btf *need_btf, u32 need_type_id); + const struct btf *need_btf, u32 need_type_id, + bool strict); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, -- cgit v1.2.3 From c317ab71facc2cd0a94145973318a4c914e11acc Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 25 Apr 2022 21:32:47 +0800 Subject: bpf: Compute map_btf_id during build time For now, the field 'map_btf_id' in 'struct bpf_map_ops' for all map types are computed during vmlinux-btf init: btf_parse_vmlinux() -> btf_vmlinux_map_ids_init() It will lookup the btf_type according to the 'map_btf_name' field in 'struct bpf_map_ops'. This process can be done during build time, thanks to Jiri's resolve_btfids. selftest of map_ptr has passed: $96 map_ptr:OK Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Reported-by: kernel test robot Signed-off-by: Menglong Dong Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0af5793ba417..be94833d390a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -148,8 +148,7 @@ struct bpf_map_ops { bpf_callback_t callback_fn, void *callback_ctx, u64 flags); - /* BTF name and id of struct allocated by map_alloc */ - const char * const map_btf_name; + /* BTF id of struct allocated by map_alloc */ int *map_btf_id; /* bpf_iter info used to open a seq_file */ -- cgit v1.2.3 From 68822bdf76f10c3dc80609d4e2cdc1e847429086 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Apr 2022 13:12:37 -0700 Subject: net: generalize skb freeing deferral to per-cpu lists Logic added in commit f35f821935d8 ("tcp: defer skb freeing after socket lock is released") helped bulk TCP flows to move the cost of skbs frees outside of critical section where socket lock was held. But for RPC traffic, or hosts with RFS enabled, the solution is far from being ideal. For RPC traffic, recvmsg() has to return to user space right after skb payload has been consumed, meaning that BH handler has no chance to pick the skb before recvmsg() thread. This issue is more visible with BIG TCP, as more RPC fit one skb. For RFS, even if BH handler picks the skbs, they are still picked from the cpu on which user thread is running. Ideally, it is better to free the skbs (and associated page frags) on the cpu that originally allocated them. This patch removes the per socket anchor (sk->defer_list) and instead uses a per-cpu list, which will hold more skbs per round. This new per-cpu list is drained at the end of net_action_rx(), after incoming packets have been processed, to lower latencies. In normal conditions, skbs are added to the per-cpu list with no further action. In the (unlikely) cases where the cpu does not run net_action_rx() handler fast enough, we use an IPI to raise NET_RX_SOFTIRQ on the remote cpu. Also, we do not bother draining the per-cpu list from dev_cpu_dead() This is because skbs in this list have no requirement on how fast they should be freed. Note that we can add in the future a small per-cpu cache if we see any contention on sd->defer_lock. Tested on a pair of hosts with 100Gbit NIC, RFS enabled, and /proc/sys/net/ipv4/tcp_rmem[2] tuned to 16MB to work around page recycling strategy used by NIC driver (its page pool capacity being too small compared to number of skbs/pages held in sockets receive queues) Note that this tuning was only done to demonstrate worse conditions for skb freeing for this particular test. These conditions can happen in more general production workload. 10 runs of one TCP_STREAM flow Before: Average throughput: 49685 Mbit. Kernel profiles on cpu running user thread recvmsg() show high cost for skb freeing related functions (*) 57.81% [kernel] [k] copy_user_enhanced_fast_string (*) 12.87% [kernel] [k] skb_release_data (*) 4.25% [kernel] [k] __free_one_page (*) 3.57% [kernel] [k] __list_del_entry_valid 1.85% [kernel] [k] __netif_receive_skb_core 1.60% [kernel] [k] __skb_datagram_iter (*) 1.59% [kernel] [k] free_unref_page_commit (*) 1.16% [kernel] [k] __slab_free 1.16% [kernel] [k] _copy_to_iter (*) 1.01% [kernel] [k] kfree (*) 0.88% [kernel] [k] free_unref_page 0.57% [kernel] [k] ip6_rcv_core 0.55% [kernel] [k] ip6t_do_table 0.54% [kernel] [k] flush_smp_call_function_queue (*) 0.54% [kernel] [k] free_pcppages_bulk 0.51% [kernel] [k] llist_reverse_order 0.38% [kernel] [k] process_backlog (*) 0.38% [kernel] [k] free_pcp_prepare 0.37% [kernel] [k] tcp_recvmsg_locked (*) 0.37% [kernel] [k] __list_add_valid 0.34% [kernel] [k] sock_rfree 0.34% [kernel] [k] _raw_spin_lock_irq (*) 0.33% [kernel] [k] __page_cache_release 0.33% [kernel] [k] tcp_v6_rcv (*) 0.33% [kernel] [k] __put_page (*) 0.29% [kernel] [k] __mod_zone_page_state 0.27% [kernel] [k] _raw_spin_lock After patch: Average throughput: 73076 Mbit. Kernel profiles on cpu running user thread recvmsg() looks better: 81.35% [kernel] [k] copy_user_enhanced_fast_string 1.95% [kernel] [k] _copy_to_iter 1.95% [kernel] [k] __skb_datagram_iter 1.27% [kernel] [k] __netif_receive_skb_core 1.03% [kernel] [k] ip6t_do_table 0.60% [kernel] [k] sock_rfree 0.50% [kernel] [k] tcp_v6_rcv 0.47% [kernel] [k] ip6_rcv_core 0.45% [kernel] [k] read_tsc 0.44% [kernel] [k] _raw_spin_lock_irqsave 0.37% [kernel] [k] _raw_spin_lock 0.37% [kernel] [k] native_irq_return_iret 0.33% [kernel] [k] __inet6_lookup_established 0.31% [kernel] [k] ip6_protocol_deliver_rcu 0.29% [kernel] [k] tcp_rcv_established 0.29% [kernel] [k] llist_reverse_order v2: kdoc issue (kernel bots) do not defer if (alloc_cpu == smp_processor_id()) (Paolo) replace the sk_buff_head with a single-linked list (Jakub) add a READ_ONCE()/WRITE_ONCE() for the lockless read of sd->defer_list Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220422201237.416238-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 +++++ include/linux/skbuff.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7dccbfd1bf56..ac8a5f71220a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3081,6 +3081,11 @@ struct softnet_data { struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + /* Another possibly contended cache line */ + spinlock_t defer_lock ____cacheline_aligned_in_smp; + int defer_count; + struct sk_buff *defer_list; + call_single_data_t defer_csd; }; static inline void input_queue_head_incr(struct softnet_data *sd) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 84d78df60453..5cbc184ca685 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -888,6 +888,7 @@ typedef unsigned char *sk_buff_data_t; * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS + * @alloc_cpu: CPU which did the skb allocation. * @secmark: security marking * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available @@ -1080,6 +1081,7 @@ struct sk_buff { unsigned int sender_cpu; }; #endif + u16 alloc_cpu; #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif @@ -1321,6 +1323,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); +void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); -- cgit v1.2.3 From 657dd5f97b2ed16cdaa6339f42f9130240af1c04 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 28 Apr 2022 11:58:45 +0100 Subject: net: inline skb_zerocopy_iter_dgram skb_zerocopy_iter_dgram() is a small proxy function, inline it. For that, move __zerocopy_sg_from_iter into linux/skbuff.h Signed-off-by: Pavel Begunkov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dc4b3e1cf21b..3270cb72e4d8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -684,20 +684,6 @@ struct ubuf_info { int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); -struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size); -struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg); - -void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); - -void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success); - -int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len); -int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, - struct msghdr *msg, int len, - struct ubuf_info *uarg); - /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -1679,6 +1665,28 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) } #endif +struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size); +struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg); + +void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); + +void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, + bool success); + +int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length); + +static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, + struct msghdr *msg, int len) +{ + return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); +} + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg); + /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) -- cgit v1.2.3 From c526fd8f9f4f21cb83c0b1c9a1ee9c0ac9be9e2e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 28 Apr 2022 11:58:46 +0100 Subject: net: inline dev_queue_xmit() Inline dev_queue_xmit() and dev_queue_xmit_accel(), they both are small proxy functions doing nothing but redirecting the control flow to __dev_queue_xmit(). Signed-off-by: Pavel Begunkov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b75ca2d095ae..4aba92a4042a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2940,10 +2940,20 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); -int dev_queue_xmit(struct sk_buff *skb); -int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); +static inline int dev_queue_xmit(struct sk_buff *skb) +{ + return __dev_queue_xmit(skb, NULL); +} + +static inline int dev_queue_xmit_accel(struct sk_buff *skb, + struct net_device *sb_dev) +{ + return __dev_queue_xmit(skb, sb_dev); +} + static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { int ret; -- cgit v1.2.3 From 48cec73a891cca087fbc7791c4753784180991a9 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Fri, 29 Apr 2022 09:19:53 +0200 Subject: net: lan966x: Fix compilation error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting from the blamed commit, the lan966x build fails with the following compilation error: drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c:342:9: error: implicit declaration of function ‘ptp_find_pin_unlocked’ [-Werror=implicit-function-declaration] 342 | pin = ptp_find_pin_unlocked(phc->clock, PTP_PF_EXTTS, 0); The issue is that there is no stub function for ptp_find_pin_unlocked in case CONFIG_PTP_1588_CLOCK is not selected. Therefore add one. Reported-by: kernel test robot Fixes: f3d8e0a9c28ba0 ("net: lan966x: Add support for PTP_PF_EXTTS") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 554454cb8693..e8cc8b6bbf50 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -321,6 +321,10 @@ static inline int ptp_clock_index(struct ptp_clock *ptp) static inline int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan) { return -1; } +static inline int ptp_find_pin_unlocked(struct ptp_clock *ptp, + enum ptp_pin_function func, + unsigned int chan) +{ return -1; } static inline int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 3254e0b9eb5649ffaa48717ebc9c593adc4ee6a9 Mon Sep 17 00:00:00 2001 From: Alexandru Tachici Date: Fri, 29 Apr 2022 18:34:31 +0300 Subject: ethtool: Add 10base-T1L link mode entry Add entry for the 10base-T1L full duplex mode. Reviewed-by: Andrew Lunn Reviewed-by: Oleksij Rempel Signed-off-by: Alexandru Tachici Signed-off-by: David S. Miller --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 36ca2b5c2253..b12af9e2f389 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -65,7 +65,7 @@ extern const int phy_basic_ports_array[3]; extern const int phy_fibre_port_array[1]; extern const int phy_all_ports_features_array[7]; extern const int phy_10_100_features_array[4]; -extern const int phy_basic_t1_features_array[2]; +extern const int phy_basic_t1_features_array[3]; extern const int phy_gbit_features_array[2]; extern const int phy_10gbit_features_array[1]; -- cgit v1.2.3 From 3da8ffd8545f62fec85a48a3c637b2f427974f11 Mon Sep 17 00:00:00 2001 From: Alexandru Tachici Date: Fri, 29 Apr 2022 18:34:34 +0300 Subject: net: phy: Add 10BASE-T1L support in phy-c45 This patch is needed because the BASE-T1 uses different registers for status, control and advertisement to those already employed in the existing phy-c45 functions. Where required, genphy_c45 functions will now check whether the device supports BASE-T1 and use the specific registers instead: 45.2.7.19 BASE-T1 AN control register, 45.2.7.20 BASE-T1 AN status, 45.2.7.21 BASE-T1 AN advertisement register, 45.2.7.22 BASE-T1 AN LP Base Page ability register, 45.2.1.185 BASE-T1 PMA/PMD control register. Tested-by: Oleksij Rempel Signed-off-by: Alexandru Tachici Signed-off-by: David S. Miller --- include/linux/mdio.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/phy.h | 3 +++ 2 files changed, 73 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index ecac96d52e01..00177567cfef 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -340,6 +340,76 @@ static inline void mii_10gbt_stat_mod_linkmode_lpa_t(unsigned long *advertising, advertising, lpa & MDIO_AN_10GBT_STAT_LP10G); } +/** + * mii_t1_adv_l_mod_linkmode_t + * @advertising: target the linkmode advertisement settings + * @lpa: value of the BASE-T1 Autonegotiation Advertisement [15:0] Register + * + * A small helper function that translates BASE-T1 Autonegotiation + * Advertisement [15:0] Register bits to linkmode advertisement settings. + * Other bits in advertising aren't changed. + */ +static inline void mii_t1_adv_l_mod_linkmode_t(unsigned long *advertising, u32 lpa) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising, + lpa & MDIO_AN_T1_ADV_L_PAUSE_CAP); + linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising, + lpa & MDIO_AN_T1_ADV_L_PAUSE_ASYM); +} + +/** + * mii_t1_adv_m_mod_linkmode_t + * @advertising: target the linkmode advertisement settings + * @lpa: value of the BASE-T1 Autonegotiation Advertisement [31:16] Register + * + * A small helper function that translates BASE-T1 Autonegotiation + * Advertisement [31:16] Register bits to linkmode advertisement settings. + * Other bits in advertising aren't changed. + */ +static inline void mii_t1_adv_m_mod_linkmode_t(unsigned long *advertising, u32 lpa) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT, + advertising, lpa & MDIO_AN_T1_ADV_M_B10L); +} + +/** + * linkmode_adv_to_mii_t1_adv_l_t + * @advertising: the linkmode advertisement settings + * + * A small helper function that translates linkmode advertisement + * settings to phy autonegotiation advertisements for the + * BASE-T1 Autonegotiation Advertisement [15:0] Register. + */ +static inline u32 linkmode_adv_to_mii_t1_adv_l_t(unsigned long *advertising) +{ + u32 result = 0; + + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) + result |= MDIO_AN_T1_ADV_L_PAUSE_CAP; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) + result |= MDIO_AN_T1_ADV_L_PAUSE_ASYM; + + return result; +} + +/** + * linkmode_adv_to_mii_t1_adv_m_t + * @advertising: the linkmode advertisement settings + * + * A small helper function that translates linkmode advertisement + * settings to phy autonegotiation advertisements for the + * BASE-T1 Autonegotiation Advertisement [31:16] Register. + */ +static inline u32 linkmode_adv_to_mii_t1_adv_m_t(unsigned long *advertising) +{ + u32 result = 0; + + if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT, advertising)) + result |= MDIO_AN_T1_ADV_M_B10L; + + return result; +} + int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, diff --git a/include/linux/phy.h b/include/linux/phy.h index b12af9e2f389..2d12054932ba 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -570,6 +570,7 @@ struct macsec_ops; * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover * @mdix_ctrl: User setting of crossover + * @pma_extable: Cached value of PMA/PMD Extended Abilities Register * @interrupts: Flag interrupts have been enabled * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics @@ -698,6 +699,8 @@ struct phy_device { u8 mdix; u8 mdix_ctrl; + int pma_extable; + void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); -- cgit v1.2.3 From d639af621600dc52dd9ed0dcf62b22595f2f5b52 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 22 Mar 2022 09:16:39 +0000 Subject: net/mlx5: fs, split software and IFC flow destination definitions Separate flow destinations between software and IFC. Flow destination type passed by callers was used as the input in firmware commands and over the years software only types were added which resulted in mixing between the two. Create an IFC enum that contains only the flow destinations defined when talking to the firmware. Now that there is a proper software only enum for flow destinations the hardcoded values can be removed as the values are no longer used in firmware commands. Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 11 +++++++++++ include/linux/mlx5/mlx5_ifc.h | 16 ++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index e3bfed68b08a..9da9df9ae751 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -40,6 +40,17 @@ #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) +enum mlx5_flow_destination_type { + MLX5_FLOW_DESTINATION_TYPE_VPORT, + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE, + MLX5_FLOW_DESTINATION_TYPE_TIR, + MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER, + MLX5_FLOW_DESTINATION_TYPE_UPLINK, + MLX5_FLOW_DESTINATION_TYPE_PORT, + MLX5_FLOW_DESTINATION_TYPE_COUNTER, + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM, +}; + enum { MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO = 1 << 16, MLX5_FLOW_CONTEXT_ACTION_ENCRYPT = 1 << 17, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7d2d0ba82144..7f4ec9faa180 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1806,16 +1806,12 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_c0[0x740]; }; -enum mlx5_flow_destination_type { - MLX5_FLOW_DESTINATION_TYPE_VPORT = 0x0, - MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, - MLX5_FLOW_DESTINATION_TYPE_TIR = 0x2, - MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6, - MLX5_FLOW_DESTINATION_TYPE_UPLINK = 0x8, - - MLX5_FLOW_DESTINATION_TYPE_PORT = 0x99, - MLX5_FLOW_DESTINATION_TYPE_COUNTER = 0x100, - MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM = 0x101, +enum mlx5_ifc_flow_destination_type { + MLX5_IFC_FLOW_DESTINATION_TYPE_VPORT = 0x0, + MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, + MLX5_IFC_FLOW_DESTINATION_TYPE_TIR = 0x2, + MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6, + MLX5_IFC_FLOW_DESTINATION_TYPE_UPLINK = 0x8, }; enum mlx5_flow_table_miss_action { -- cgit v1.2.3 From 6510bc0d7cb4119ebe10f9ab43aca6fd0e5f3e73 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 15 Mar 2022 14:08:23 +0000 Subject: net/mlx5: fs, add unused destination type When the caller doesn't pass a destination fs_core will create a unused rule just so a context can be returned. This unused rule is zeroed out and its type is 0 which can be mixed up with MLX5_FLOW_DESTINATION_TYPE_VPORT. Create a dedicated type to differentiate between the two named MLX5_FLOW_DESTINATION_TYPE_NONE. Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 9da9df9ae751..8135713b0d2d 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -41,6 +41,7 @@ #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) enum mlx5_flow_destination_type { + MLX5_FLOW_DESTINATION_TYPE_NONE, MLX5_FLOW_DESTINATION_TYPE_VPORT, MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE, MLX5_FLOW_DESTINATION_TYPE_TIR, -- cgit v1.2.3 From 4c7f24f857c7cd0381dd92495db476066d1c6aec Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sun, 1 May 2022 11:55:23 +0800 Subject: net: sysctl: introduce sysctl SYSCTL_THREE This patch introdues the SYSCTL_THREE. KUnit: [00:10:14] ================ sysctl_test (10 subtests) ================= [00:10:14] [PASSED] sysctl_test_api_dointvec_null_tbl_data [00:10:14] [PASSED] sysctl_test_api_dointvec_table_maxlen_unset [00:10:14] [PASSED] sysctl_test_api_dointvec_table_len_is_zero [00:10:14] [PASSED] sysctl_test_api_dointvec_table_read_but_position_set [00:10:14] [PASSED] sysctl_test_dointvec_read_happy_single_positive [00:10:14] [PASSED] sysctl_test_dointvec_read_happy_single_negative [00:10:14] [PASSED] sysctl_test_dointvec_write_happy_single_positive [00:10:14] [PASSED] sysctl_test_dointvec_write_happy_single_negative [00:10:14] [PASSED] sysctl_test_api_dointvec_write_single_less_int_min [00:10:14] [PASSED] sysctl_test_api_dointvec_write_single_greater_int_max [00:10:14] =================== [PASSED] sysctl_test =================== ./run_kselftest.sh -c sysctl ... ok 1 selftests: sysctl: sysctl.sh Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Hideaki YOSHIFUJI Cc: David Ahern Cc: Simon Horman Cc: Julian Anastasov Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: Shuah Khan Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Eric Dumazet Cc: Lorenz Bauer Cc: Akhmat Karakotov Signed-off-by: Tonghao Zhang Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- include/linux/sysctl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 6353d6db69b2..80263f7cdb77 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -38,10 +38,10 @@ struct ctl_table_header; struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ -#define SYSCTL_NEG_ONE ((void *)&sysctl_vals[0]) -#define SYSCTL_ZERO ((void *)&sysctl_vals[1]) -#define SYSCTL_ONE ((void *)&sysctl_vals[2]) -#define SYSCTL_TWO ((void *)&sysctl_vals[3]) +#define SYSCTL_ZERO ((void *)&sysctl_vals[0]) +#define SYSCTL_ONE ((void *)&sysctl_vals[1]) +#define SYSCTL_TWO ((void *)&sysctl_vals[2]) +#define SYSCTL_THREE ((void *)&sysctl_vals[3]) #define SYSCTL_FOUR ((void *)&sysctl_vals[4]) #define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5]) #define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6]) @@ -51,6 +51,7 @@ struct ctl_dir; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ #define SYSCTL_MAXOLDUID ((void *)&sysctl_vals[10]) +#define SYSCTL_NEG_ONE ((void *)&sysctl_vals[11]) extern const int sysctl_vals[]; -- cgit v1.2.3 From 58caed3dacb4354a25a1aa8d2febc3e9648ba1f4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 May 2022 16:27:03 -0700 Subject: netdev: reshuffle netif_napi_add() APIs to allow dropping weight Most drivers should not have to worry about selecting the right weight for their NAPI instances and pass NAPI_POLL_WEIGHT. It'd be best if we didn't require the argument at all and selected the default internally. This change prepares the ground for such reshuffling, allowing for a smooth transition. The following API should remain after the next release cycle: netif_napi_add() netif_napi_add_weight() netif_napi_add_tx() netif_napi_add_tx_weight() Where the _weight() variants take an explicit weight argument. I opted for a _weight() suffix rather than a __ prefix, because we use __ in places to mean that caller needs to also issue a synchronize_net() call. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20220502232703.396351-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 50 +++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4aba92a4042a..eaf66e57d891 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2499,37 +2499,53 @@ static inline void *netdev_priv(const struct net_device *dev) */ #define NAPI_POLL_WEIGHT 64 +void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight); + /** - * netif_napi_add - initialize a NAPI context - * @dev: network device - * @napi: NAPI context - * @poll: polling function - * @weight: default weight + * netif_napi_add() - initialize a NAPI context + * @dev: network device + * @napi: NAPI context + * @poll: polling function + * @weight: default weight * * netif_napi_add() must be used to initialize a NAPI context prior to calling * *any* of the other NAPI-related functions. */ -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight); +static inline void +netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + netif_napi_add_weight(dev, napi, poll, weight); +} + +static inline void +netif_napi_add_tx_weight(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); + netif_napi_add_weight(dev, napi, poll, weight); +} + +#define netif_tx_napi_add netif_napi_add_tx_weight /** - * netif_tx_napi_add - initialize a NAPI context - * @dev: network device - * @napi: NAPI context - * @poll: polling function - * @weight: default weight + * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only + * @dev: network device + * @napi: NAPI context + * @poll: polling function * * This variant of netif_napi_add() should be used from drivers using NAPI * to exclusively poll a TX queue. * This will avoid we add it into napi_hash[], thus polluting this hash table. */ -static inline void netif_tx_napi_add(struct net_device *dev, +static inline void netif_napi_add_tx(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), - int weight) + int (*poll)(struct napi_struct *, int)) { - set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); - netif_napi_add(dev, napi, poll, weight); + netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } /** -- cgit v1.2.3 From c674df973ad8af2074c834788e167332d81309fa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Mar 2022 20:36:15 +0200 Subject: net/mlx5: Store IPsec ESN update work in XFRM state mlx5 IPsec code updated ESN through workqueue with allocation calls in the data path, which can be saved easily if the work is created during XFRM state initialization routine. The locking used later in the work didn't protect from anything because change of HW context is possible during XFRM state add or delete only, which can cancel work and make sure that it is not running. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index 0f2596297f6a..a2720ebbb9fd 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -127,8 +127,8 @@ struct mlx5_accel_esp_xfrm * mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, const struct mlx5_accel_esp_xfrm_attrs *attrs); void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm); -int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, - const struct mlx5_accel_esp_xfrm_attrs *attrs); +void mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs); #else @@ -145,9 +145,11 @@ mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, } static inline void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) {} -static inline int +static inline void mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, - const struct mlx5_accel_esp_xfrm_attrs *attrs) { return -EOPNOTSUPP; } + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ +} #endif /* CONFIG_MLX5_EN_IPSEC */ #endif /* __MLX5_ACCEL_H__ */ -- cgit v1.2.3 From 2ea36e2e4ad2b77bd5d45142a529b72eb5ca9156 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Mar 2022 20:55:00 +0200 Subject: net/mlx5: Remove useless validity check All callers build xfrm attributes with help of mlx5e_ipsec_build_accel_xfrm_attrs() function that ensure validity of attributes. There is no need to recheck them again. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index a2720ebbb9fd..9c511d466e55 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -36,10 +36,6 @@ #include -enum mlx5_accel_esp_aes_gcm_keymat_iv_algo { - MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ, -}; - enum mlx5_accel_esp_flags { MLX5_ACCEL_ESP_FLAGS_TUNNEL = 0, /* Default */ MLX5_ACCEL_ESP_FLAGS_TRANSPORT = 1UL << 0, @@ -57,14 +53,9 @@ enum mlx5_accel_esp_keymats { MLX5_ACCEL_ESP_KEYMAT_AES_GCM, }; -enum mlx5_accel_esp_replay { - MLX5_ACCEL_ESP_REPLAY_NONE, - MLX5_ACCEL_ESP_REPLAY_BMP, -}; struct aes_gcm_keymat { u64 seq_iv; - enum mlx5_accel_esp_aes_gcm_keymat_iv_algo iv_algo; u32 salt; u32 icv_len; @@ -81,7 +72,6 @@ struct mlx5_accel_esp_xfrm_attrs { u32 tfc_pad; u32 flags; u32 sa_handle; - enum mlx5_accel_esp_replay replay_type; union { struct { u32 size; -- cgit v1.2.3 From c6e3b421c7079af67201351c9faff62613e06f40 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 9 Mar 2022 10:35:05 +0200 Subject: net/mlx5: Merge various control path IPsec headers into one file The mlx5 IPsec code has logical separation between code that operates with XFRM objects (ipsec.c), HW objects (ipsec_offload.c), flow steering logic (ipsec_fs.c) and data path (ipsec_rxtx.c). Such separation makes sense for C-files, but isn't needed at all for H-files as they are included in batch anyway. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 145 --------------------------------------------- 1 file changed, 145 deletions(-) delete mode 100644 include/linux/mlx5/accel.h (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h deleted file mode 100644 index 9c511d466e55..000000000000 --- a/include/linux/mlx5/accel.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2018 Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#ifndef __MLX5_ACCEL_H__ -#define __MLX5_ACCEL_H__ - -#include - -enum mlx5_accel_esp_flags { - MLX5_ACCEL_ESP_FLAGS_TUNNEL = 0, /* Default */ - MLX5_ACCEL_ESP_FLAGS_TRANSPORT = 1UL << 0, - MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED = 1UL << 1, - MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP = 1UL << 2, -}; - -enum mlx5_accel_esp_action { - MLX5_ACCEL_ESP_ACTION_DECRYPT, - MLX5_ACCEL_ESP_ACTION_ENCRYPT, -}; - -enum mlx5_accel_esp_keymats { - MLX5_ACCEL_ESP_KEYMAT_AES_NONE, - MLX5_ACCEL_ESP_KEYMAT_AES_GCM, -}; - - -struct aes_gcm_keymat { - u64 seq_iv; - - u32 salt; - u32 icv_len; - - u32 key_len; - u32 aes_key[256 / 32]; -}; - -struct mlx5_accel_esp_xfrm_attrs { - enum mlx5_accel_esp_action action; - u32 esn; - __be32 spi; - u32 seq; - u32 tfc_pad; - u32 flags; - u32 sa_handle; - union { - struct { - u32 size; - - } bmp; - } replay; - enum mlx5_accel_esp_keymats keymat_type; - union { - struct aes_gcm_keymat aes_gcm; - } keymat; - - union { - __be32 a4; - __be32 a6[4]; - } saddr; - - union { - __be32 a4; - __be32 a6[4]; - } daddr; - - u8 is_ipv6; -}; - -struct mlx5_accel_esp_xfrm { - struct mlx5_core_dev *mdev; - struct mlx5_accel_esp_xfrm_attrs attrs; -}; - -enum mlx5_accel_ipsec_cap { - MLX5_ACCEL_IPSEC_CAP_DEVICE = 1 << 0, - MLX5_ACCEL_IPSEC_CAP_ESP = 1 << 1, - MLX5_ACCEL_IPSEC_CAP_IPV6 = 1 << 2, - MLX5_ACCEL_IPSEC_CAP_LSO = 1 << 3, - MLX5_ACCEL_IPSEC_CAP_ESN = 1 << 4, -}; - -#ifdef CONFIG_MLX5_EN_IPSEC - -u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev); - -struct mlx5_accel_esp_xfrm * -mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, - const struct mlx5_accel_esp_xfrm_attrs *attrs); -void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm); -void mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, - const struct mlx5_accel_esp_xfrm_attrs *attrs); - -#else - -static inline u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev) -{ - return 0; -} - -static inline struct mlx5_accel_esp_xfrm * -mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, - const struct mlx5_accel_esp_xfrm_attrs *attrs) -{ - return ERR_PTR(-EOPNOTSUPP); -} -static inline void -mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) {} -static inline void -mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, - const struct mlx5_accel_esp_xfrm_attrs *attrs) -{ -} - -#endif /* CONFIG_MLX5_EN_IPSEC */ -#endif /* __MLX5_ACCEL_H__ */ -- cgit v1.2.3 From 1c4a59b9fa98c222bd6a8fa82bff942312165c1a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 27 Mar 2022 19:23:19 +0300 Subject: net/mlx5: Remove not-supported ICV length mlx5 doesn't allow to configure any AEAD ICV length other than 128, so remove the logic that configures other unsupported values. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7f4ec9faa180..7bab3e51c61e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11379,8 +11379,6 @@ enum { enum { MLX5_IPSEC_OBJECT_ICV_LEN_16B, - MLX5_IPSEC_OBJECT_ICV_LEN_12B, - MLX5_IPSEC_OBJECT_ICV_LEN_8B, }; struct mlx5_ifc_ipsec_obj_bits { -- cgit v1.2.3 From c67b627e99affe4865b13557d68152ad4c35515c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 May 2022 10:09:47 -0700 Subject: net: Make msg_zerocopy_alloc static msg_zerocopy_alloc is only used by msg_zerocopy_realloc; remove the export and make static in skbuff.c Signed-off-by: David Ahern Acked-by: Jonathan Lemon Link: https://lore.kernel.org/r/20220504170947.18773-1-dsahern@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3270cb72e4d8..5c2599e3fe7d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1665,7 +1665,6 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) } #endif -struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size); struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); -- cgit v1.2.3 From 6df6398f7c8b481ce83f28143bc08a5231616deb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:31 -0700 Subject: net: add netif_inherit_tso_max() To make later patches smaller create a helper for inheriting the TSO limitations of a lower device. The TSO in the name is not an accident, subsequent patches will replace GSO with TSO in more names. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eaf66e57d891..006bb5c0e413 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4895,6 +4895,9 @@ static inline void netif_set_gro_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_max_size, size); } +void netif_inherit_tso_max(struct net_device *to, + const struct net_device *from); + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) -- cgit v1.2.3 From 14d7b8122fd591693a2388b98563707ba72c6780 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:32 -0700 Subject: net: don't allow user space to lift the device limits Up until commit 46e6b992c250 ("rtnetlink: allow GSO maximums to be set on device creation") the gso_max_segs and gso_max_size of a device were not controlled from user space. The quoted commit added the ability to control them because of the following setup: netns A | netns B veth<->veth eth0 If eth0 has TSO limitations and user wants to efficiently forward traffic between eth0 and the veths they should copy the TSO limitations of eth0 onto the veths. This would happen automatically for macvlans or ipvlan but veth users are not so lucky (given the loose coupling). Unfortunately the commit in question allowed users to also override the limits on real HW devices. It may be useful to control the max GSO size and someone may be using that ability (not that I know of any user), so create a separate set of knobs to reliably record the TSO limitations. Validate the user requests. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 006bb5c0e413..e12f7de6d6ae 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1917,8 +1917,10 @@ enum netdev_ml_priv_type { * @rtnl_link_ops: Rtnl_link_ops * * @gso_max_size: Maximum size of generic segmentation offload + * @tso_max_size: Device (as in HW) limit on the max TSO request size * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO + * @tso_max_segs: Device (as in HW) limit on the max TSO segment count * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device @@ -2262,8 +2264,13 @@ struct net_device { /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; +#define TSO_LEGACY_MAX_SIZE 65536 +#define TSO_MAX_SIZE UINT_MAX + unsigned int tso_max_size; #define GSO_MAX_SEGS 65535 u16 gso_max_segs; +#define TSO_MAX_SEGS U16_MAX + u16 tso_max_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; @@ -4895,6 +4902,8 @@ static inline void netif_set_gro_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_max_size, size); } +void netif_set_tso_max_size(struct net_device *dev, unsigned int size); +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, const struct net_device *from); -- cgit v1.2.3 From 744d49daf8bd3b17b345c836f2e6f97d49fa6ae8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:34 -0700 Subject: net: move netif_set_gso_max helpers These are now internal to the core, no need to expose them. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e12f7de6d6ae..8cf0ac616cb9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4881,27 +4881,6 @@ static inline bool netif_needs_gso(struct sk_buff *skb, (skb->ip_summed != CHECKSUM_UNNECESSARY))); } -static inline void netif_set_gso_max_size(struct net_device *dev, - unsigned int size) -{ - /* dev->gso_max_size is read locklessly from sk_setup_caps() */ - WRITE_ONCE(dev->gso_max_size, size); -} - -static inline void netif_set_gso_max_segs(struct net_device *dev, - unsigned int segs) -{ - /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ - WRITE_ONCE(dev->gso_max_segs, segs); -} - -static inline void netif_set_gro_max_size(struct net_device *dev, - unsigned int size) -{ - /* This pairs with the READ_ONCE() in skb_gro_receive() */ - WRITE_ONCE(dev->gro_max_size, size); -} - void netif_set_tso_max_size(struct net_device *dev, unsigned int size); void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, -- cgit v1.2.3 From 2fbdf45d7d26361a0c3ec8833fd96edf0f5812da Mon Sep 17 00:00:00 2001 From: Ricardo Martinez Date: Fri, 6 May 2022 11:12:57 -0700 Subject: list: Add list_next_entry_circular() and list_prev_entry_circular() Add macros to get the next or previous entries and wraparound if needed. For example, calling list_next_entry_circular() on the last element should return the first element in the list. Signed-off-by: Ricardo Martinez Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/list.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index dd6c2041d09c..c147eeb2d39d 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -563,6 +563,19 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) +/** + * list_next_entry_circular - get the next element in list + * @pos: the type * to cursor. + * @head: the list head to take the element from. + * @member: the name of the list_head within the struct. + * + * Wraparound if pos is the last element (return the first element). + * Note, that list is expected to be not empty. + */ +#define list_next_entry_circular(pos, head, member) \ + (list_is_last(&(pos)->member, head) ? \ + list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) + /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor @@ -571,6 +584,19 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) +/** + * list_prev_entry_circular - get the prev element in list + * @pos: the type * to cursor. + * @head: the list head to take the element from. + * @member: the name of the list_head within the struct. + * + * Wraparound if pos is the first element (return the last element). + * Note, that list is expected to be not empty. + */ +#define list_prev_entry_circular(pos, head, member) \ + (list_is_first(&(pos)->member, head) ? \ + list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) + /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. -- cgit v1.2.3 From a4ff365346c98081311c299955d91d657123e8aa Mon Sep 17 00:00:00 2001 From: Ricardo Martinez Date: Fri, 6 May 2022 11:12:58 -0700 Subject: net: skb: introduce skb_data_area_size() Helper to calculate the linear data space in the skb. Signed-off-by: Ricardo Martinez Reviewed-by: Sergey Ryazanov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5c2599e3fe7d..d58669d6cb91 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1665,6 +1665,11 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) } #endif +static inline unsigned int skb_data_area_size(struct sk_buff *skb) +{ + return skb_end_pointer(skb) - skb->data; +} + struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); -- cgit v1.2.3 From ca4567f1e6f660f86fcd04f3563c0045b0d4772f Mon Sep 17 00:00:00 2001 From: Alaa Mohamed Date: Thu, 5 May 2022 17:09:57 +0200 Subject: rtnetlink: add extack support in fdb del handlers Add extack support to .ndo_fdb_del in netdevice.h and all related methods. Signed-off-by: Alaa Mohamed Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8cf0ac616cb9..74c97a34921d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1513,7 +1513,7 @@ struct net_device_ops { struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - u16 vid); + u16 vid, struct netlink_ext_ack *extack); int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, -- cgit v1.2.3 From 90532850eb210dff70423eaf20e323a91ef542d8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 6 May 2022 06:23:52 +0200 Subject: net: phy: introduce genphy_c45_pma_baset1_setup_master_slave() Move baset1 specific part of genphy_c45_pma_setup_forced() code to separate function to make it reusable by PHY drivers. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 2d12054932ba..d3f924d3b235 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1614,6 +1614,7 @@ int genphy_c45_read_link(struct phy_device *phydev); int genphy_c45_read_lpa(struct phy_device *phydev); int genphy_c45_read_pma(struct phy_device *phydev); int genphy_c45_pma_setup_forced(struct phy_device *phydev); +int genphy_c45_pma_baset1_setup_master_slave(struct phy_device *phydev); int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); -- cgit v1.2.3 From b9a366f3d874ce1c9c0148ec399f2ce4ab05a467 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 6 May 2022 06:23:54 +0200 Subject: net: phy: introduce genphy_c45_pma_baset1_read_master_slave() Move baset1 specific part of genphy_c45_read_pma() code to separate function to make it reusable by PHY drivers. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index d3f924d3b235..4713c95d65fb 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1619,6 +1619,7 @@ int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); +int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); int genphy_c45_config_aneg(struct phy_device *phydev); int genphy_c45_loopback(struct phy_device *phydev, bool enable); -- cgit v1.2.3 From 2013ad8836aceb828c0fb5b16fa8bb98fd3d9b28 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 6 May 2022 06:23:56 +0200 Subject: net: phy: export genphy_c45_baset1_read_status() Export genphy_c45_baset1_read_status() to make it reusable by PHY drivers. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 4713c95d65fb..508f1149665b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1621,6 +1621,7 @@ int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); +int genphy_c45_baset1_read_status(struct phy_device *phydev); int genphy_c45_config_aneg(struct phy_device *phydev); int genphy_c45_loopback(struct phy_device *phydev, bool enable); int genphy_c45_pma_resume(struct phy_device *phydev); -- cgit v1.2.3 From 8324a02c342a36336114a497130826612ed5520d Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Sun, 27 Mar 2022 17:45:32 +0300 Subject: net/mlx5: Add exit route when waiting for FW Currently, removing a device needs to get the driver interface lock before doing any cleanup. If the driver is waiting in a loop for FW init, there is no way to cancel the wait, instead the device cleanup waits for the loop to conclude and release the lock. To allow immediate response to remove device commands, check the TEARDOWN flag while waiting for FW init, and exit the loop if it has been set. Signed-off-by: Gavin Li Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ff47d49d8be4..f327d0544038 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -632,6 +632,7 @@ enum mlx5_device_state { enum mlx5_interface_state { MLX5_INTERFACE_STATE_UP = BIT(0), + MLX5_BREAK_FW_WAIT = BIT(1), }; enum mlx5_pci_status { -- cgit v1.2.3 From 34a30d7635a8e37275a7b63bec09035ed762969b Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 1 Mar 2022 15:42:01 +0000 Subject: net/mlx5: Lag, expose number of lag ports Downstream patches will add support for hardware lag with more than 2 ports. Add a way for users to query the number of lag ports. Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f327d0544038..62ea1120de9c 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1142,6 +1142,7 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, int num_counters, size_t *offsets); struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev); +u8 mlx5_lag_get_num_ports(struct mlx5_core_dev *dev); struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev); void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up); int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, -- cgit v1.2.3 From 4cd14d44b11dabf195d1e66dadbb954336224658 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 1 Mar 2022 17:34:58 +0000 Subject: net/mlx5: Support devices with more than 2 ports Increase the define MLX5_MAX_PORTS to 4 as the driver is ready to support NICs with 4 ports. Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 62ea1120de9c..fdb9d07a05a4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -84,7 +84,7 @@ enum mlx5_sqp_t { }; enum { - MLX5_MAX_PORTS = 2, + MLX5_MAX_PORTS = 4, }; enum { -- cgit v1.2.3 From 7f46a0b7327ae261f9981888708dbca22c283900 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 15 Mar 2022 16:56:50 +0000 Subject: net/mlx5: Lag, add debugfs to query hardware lag state Lag state has become very complicated with many modes, flags, types and port selections methods and future work will add additional features. Add a debugfs to query the current lag state. A new directory named "lag" will be created under the mlx5 debugfs directory. As the driver has debugfs per pci function the location will be: /mlx5//lag For example: /sys/kernel/debug/mlx5/0000:08:00.0/lag The following files are exposed: - state: Returns "active" or "disabled". If "active" it means hardware lag is active. - members: Returns the BDFs of all the members of lag object. - type: Returns the type of the lag currently configured. Valid only if hardware lag is active. * "roce" - Members are bare metal PFs. * "switchdev" - Members are in switchdev mode. * "multipath" - ECMP offloads. - port_sel_mode: Returns the egress port selection method, valid only if hardware lag is active. * "queue_affinity" - Egress port is selected by the QP/SQ affinity. * "hash" - Egress port is selected by hash done on each packet. Controlled by: xmit_hash_policy of the bond device. - flags: Returns flags that are specific per lag @type. Valid only if hardware lag is active. * "shared_fdb" - "on" or "off", if "on" single FDB is used. - mapping: Returns the mapping which is used to select egress port. Valid only if hardware lag is active. If @port_sel_mode is "hash" returns the active egress ports. The hash result will select only active ports. if @port_sel_mode is "queue_affinity" returns the mapping between the configured port affinity of the QP/SQ and actual egress port. For example: * 1:1 - Mapping means if the configured affinity is port 1 traffic will egress via port 1. * 1:2 - Mapping means if the configured affinity is port 1 traffic will egress via port 2. This can happen if port 1 is down or in active/backup mode and port 1 is backup. Signed-off-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fdb9d07a05a4..d6bac3976913 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -558,6 +558,7 @@ struct mlx5_debugfs_entries { struct dentry *cq_debugfs; struct dentry *cmdif_debugfs; struct dentry *pages_debugfs; + struct dentry *lag_debugfs; }; struct mlx5_ft_pool; -- cgit v1.2.3 From 42704b26b0f1d891f6cf4ebc877dbac0d17c690d Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 6 May 2022 22:01:37 +0200 Subject: ptp: Add cycles support for virtual clocks ptp vclocks require a free running time for their timecounter. Currently only a physical clock forced to free running is supported. If vclocks are used, then the physical clock cannot be synchronized anymore. The synchronized time is not available in hardware in this case. As a result, timed transmission with TAPRIO hardware support is not possible anymore. If hardware would support a free running time additionally to the physical clock, then the physical clock does not need to be forced to free running. Thus, the physical clocks can still be synchronized while vclocks are in use. The physical clock could be used to synchronize the time domain of the TSN network and trigger TAPRIO. In parallel vclocks can be used to synchronize other time domains. Introduce support for a free running cycle counter called cycles to physical clocks. Rework ptp vclocks to use this free running cycle counter. Default implementation is based on time of physical clock. Thus, behavior of ptp vclocks based on physical clocks without free running cycle counter is identical to previous behavior. Signed-off-by: Gerhard Engleder Acked-by: Richard Cochran Signed-off-by: Paolo Abeni --- include/linux/ptp_clock_kernel.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index e8cc8b6bbf50..ad309202cf9f 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -108,6 +108,32 @@ struct ptp_system_timestamp { * @settime64: Set the current time on the hardware clock. * parameter ts: Time value to set. * + * @getcycles64: Reads the current free running cycle counter from the hardware + * clock. + * If @getcycles64 and @getcyclesx64 are not supported, then + * @gettime64 or @gettimex64 will be used as default + * implementation. + * parameter ts: Holds the result. + * + * @getcyclesx64: Reads the current free running cycle counter from the + * hardware clock and optionally also the system clock. + * If @getcycles64 and @getcyclesx64 are not supported, then + * @gettimex64 will be used as default implementation if + * available. + * parameter ts: Holds the PHC timestamp. + * parameter sts: If not NULL, it holds a pair of timestamps + * from the system clock. The first reading is made right before + * reading the lowest bits of the PHC timestamp and the second + * reading immediately follows that. + * + * @getcrosscycles: Reads the current free running cycle counter from the + * hardware clock and system clock simultaneously. + * If @getcycles64 and @getcyclesx64 are not supported, then + * @getcrosststamp will be used as default implementation if + * available. + * parameter cts: Contains timestamp (device,system) pair, + * where system time is realtime and monotonic. + * * @enable: Request driver to enable or disable an ancillary feature. * parameter request: Desired resource to enable or disable. * parameter on: Caller passes one to enable or zero to disable. @@ -155,6 +181,11 @@ struct ptp_clock_info { int (*getcrosststamp)(struct ptp_clock_info *ptp, struct system_device_crosststamp *cts); int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts); + int (*getcycles64)(struct ptp_clock_info *ptp, struct timespec64 *ts); + int (*getcyclesx64)(struct ptp_clock_info *ptp, struct timespec64 *ts, + struct ptp_system_timestamp *sts); + int (*getcrosscycles)(struct ptp_clock_info *ptp, + struct system_device_crosststamp *cts); int (*enable)(struct ptp_clock_info *ptp, struct ptp_clock_request *request, int on); int (*verify)(struct ptp_clock_info *ptp, unsigned int pin, -- cgit v1.2.3 From 51eb7492af276b5b4d27cfa4474d40bdac7b9cf8 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 6 May 2022 22:01:38 +0200 Subject: ptp: Request cycles for TX timestamp The free running cycle counter of physical clocks called cycles shall be used for hardware timestamps to enable synchronisation. Introduce new flag SKBTX_HW_TSTAMP_USE_CYCLES, which signals driver to provide a TX timestamp based on cycles if cycles are supported. Signed-off-by: Gerhard Engleder Acked-by: Richard Cochran Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d58669d6cb91..4a4f25975ca2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -615,6 +615,9 @@ enum { /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, + /* generate hardware time stamp based on cycles if supported */ + SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3, + /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, @@ -624,7 +627,9 @@ enum { #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) -#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) +#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ + SKBTX_HW_TSTAMP_USE_CYCLES | \ + SKBTX_ANY_SW_TSTAMP) /* Definitions for flags in struct skb_shared_info */ enum { -- cgit v1.2.3 From d58809d854c9ee19e4cd41023e137e65e9dc3f94 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 6 May 2022 22:01:39 +0200 Subject: ptp: Pass hwtstamp to ptp_convert_timestamp() ptp_convert_timestamp() converts only the timestamp hwtstamp, which is a field of the argument with the type struct skb_shared_hwtstamps *. So a pointer to the hwtstamp field of this structure is sufficient. Rework ptp_convert_timestamp() to use an argument of type ktime_t *. This allows to add additional timestamp manipulation stages before the call of ptp_convert_timestamp(). Signed-off-by: Gerhard Engleder Acked-by: Richard Cochran Signed-off-by: Paolo Abeni --- include/linux/ptp_clock_kernel.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index ad309202cf9f..92b44161408e 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -384,17 +384,16 @@ int ptp_get_vclocks_index(int pclock_index, int **vclock_index); /** * ptp_convert_timestamp() - convert timestamp to a ptp vclock time * - * @hwtstamps: skb_shared_hwtstamps structure pointer + * @hwtstamp: timestamp * @vclock_index: phc index of ptp vclock. * * Returns converted timestamp, or 0 on error. */ -ktime_t ptp_convert_timestamp(const struct skb_shared_hwtstamps *hwtstamps, - int vclock_index); +ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index); #else static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { return 0; } -static inline ktime_t ptp_convert_timestamp(const struct skb_shared_hwtstamps *hwtstamps, +static inline ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index) { return 0; } -- cgit v1.2.3 From 97dc7cd92ac67f6e05df418df1772ba4a7fbf693 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 6 May 2022 22:01:40 +0200 Subject: ptp: Support late timestamp determination If a physical clock supports a free running cycle counter, then timestamps shall be based on this time too. For TX it is known in advance before the transmission if a timestamp based on the free running cycle counter is needed. For RX it is impossible to know which timestamp is needed before the packet is received and assigned to a socket. Support late timestamp determination by a network device. Therefore, an address/cookie is stored within the new netdev_data field of struct skb_shared_hwtstamps. This address/cookie is provided to a new network device function called ndo_get_tstamp(), which returns a timestamp based on the normal/adjustable time or based on the free running cycle counter. If function is not supported, then timestamp handling is not changed. This mechanism is intended for RX, but TX use is also possible. Signed-off-by: Gerhard Engleder Acked-by: Jonathan Lemon Acked-by: Richard Cochran Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 21 +++++++++++++++++++++ include/linux/skbuff.h | 14 +++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 74c97a34921d..51fe143091cf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1356,6 +1356,12 @@ struct netdev_net_notifier { * The caller must be under RCU read context. * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); * Get the forwarding path to reach the real device from the HW destination address + * ktime_t (*ndo_get_tstamp)(struct net_device *dev, + * const struct skb_shared_hwtstamps *hwtstamps, + * bool cycles); + * Get hardware timestamp based on normal/adjustable time or free running + * cycle counter. This function is required if physical clock supports a + * free running cycle counter. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1578,6 +1584,9 @@ struct net_device_ops { struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); + ktime_t (*ndo_get_tstamp)(struct net_device *dev, + const struct skb_shared_hwtstamps *hwtstamps, + bool cycles); }; /** @@ -4761,6 +4770,18 @@ static inline void netdev_rx_csum_fault(struct net_device *dev, void net_enable_timestamp(void); void net_disable_timestamp(void); +static inline ktime_t netdev_get_tstamp(struct net_device *dev, + const struct skb_shared_hwtstamps *hwtstamps, + bool cycles) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_get_tstamp) + return ops->ndo_get_tstamp(dev, hwtstamps, cycles); + + return hwtstamps->hwtstamp; +} + static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4a4f25975ca2..97de40bcfef6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -588,8 +588,10 @@ static inline bool skb_frag_must_loop(struct page *p) /** * struct skb_shared_hwtstamps - hardware time stamps - * @hwtstamp: hardware time stamp transformed into duration - * since arbitrary point in time + * @hwtstamp: hardware time stamp transformed into duration + * since arbitrary point in time + * @netdev_data: address/cookie of network device driver used as + * reference to actual hardware time stamp * * Software time stamps generated by ktime_get_real() are stored in * skb->tstamp. @@ -601,7 +603,10 @@ static inline bool skb_frag_must_loop(struct page *p) * &skb_shared_info. Use skb_hwtstamps() to get a pointer. */ struct skb_shared_hwtstamps { - ktime_t hwtstamp; + union { + ktime_t hwtstamp; + void *netdev_data; + }; }; /* Definitions for tx_flags in struct skb_shared_info */ @@ -621,6 +626,9 @@ enum { /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, + /* determine hardware time stamp based on time or cycles */ + SKBTX_HW_TSTAMP_NETDEV = 1 << 5, + /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, }; -- cgit v1.2.3 From 9f88361273082825d9f0d13a543d49f9fa0d44a8 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Tue, 10 May 2022 17:52:30 +0200 Subject: bpf: Add bpf_link iterator Implement bpf_link iterator to traverse links via bpf_seq_file operations. The changeset is mostly shamelessly copied from commit a228a64fc1e4 ("bpf: Add bpf_prog iterator") Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220510155233.9815-2-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be94833d390a..551b7198ae8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1544,6 +1544,7 @@ void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd); struct bpf_link *bpf_link_get_from_fd(u32 ufd); +struct bpf_link *bpf_link_get_curr_or_next(u32 *id); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); -- cgit v1.2.3 From d721def7392a7348ffb9f3583b264239cbd3702c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2022 14:26:12 +0200 Subject: kallsyms: Make kallsyms_on_each_symbol generally available Making kallsyms_on_each_symbol generally available, so it can be used outside CONFIG_LIVEPATCH option in following changes. Rather than adding another ifdef option let's make the function generally available (when CONFIG_KALLSYMS option is defined). Cc: Christoph Hellwig Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220510122616.2652285-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/kallsyms.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index ce1bd2fbf23e..ad39636e0c3f 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -65,11 +65,11 @@ static inline void *dereference_symbol_descriptor(void *ptr) return ptr; } +#ifdef CONFIG_KALLSYMS int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); -#ifdef CONFIG_KALLSYMS /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); @@ -163,6 +163,11 @@ static inline bool kallsyms_show_value(const struct cred *cred) return false; } +static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, + unsigned long), void *data) +{ + return -EOPNOTSUPP; +} #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) -- cgit v1.2.3 From bed0d9a50dacee6fcf785c555cfb0d2573355afc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2022 14:26:13 +0200 Subject: ftrace: Add ftrace_lookup_symbols function Adding ftrace_lookup_symbols function that resolves array of symbols with single pass over kallsyms. The user provides array of string pointers with count and pointer to allocated array for resolved values. int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) It iterates all kallsyms symbols and tries to loop up each in provided symbols array with bsearch. The symbols array needs to be sorted by name for this reason. We also check each symbol to pass ftrace_location, because this API will be used for fprobe symbols resolving. Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220510122616.2652285-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/ftrace.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4816b7e11047..820500430eae 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -303,6 +303,8 @@ int unregister_ftrace_function(struct ftrace_ops *ops); extern void ftrace_stub(unsigned long a0, unsigned long a1, struct ftrace_ops *op, struct ftrace_regs *fregs); + +int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs); #else /* !CONFIG_FUNCTION_TRACER */ /* * (un)register_ftrace_function must be a macro since the ops parameter @@ -313,6 +315,10 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1, static inline void ftrace_kill(void) { } static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } +static inline int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_FUNCTION_TRACER */ struct ftrace_func_entry { -- cgit v1.2.3 From ddccc9ef55992716ad477d38fbcd9f8f1d34fc67 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 May 2022 09:04:54 -0700 Subject: skbuff: add a basic intro doc Add basic skb documentation. It's mostly an intro to the subsequent patches - it would looks strange if we documented advanced topics without covering the basics in any way. Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 97de40bcfef6..69802624276d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -800,6 +800,46 @@ typedef unsigned int sk_buff_data_t; typedef unsigned char *sk_buff_data_t; #endif +/** + * DOC: Basic sk_buff geometry + * + * struct sk_buff itself is a metadata structure and does not hold any packet + * data. All the data is held in associated buffers. + * + * &sk_buff.head points to the main "head" buffer. The head buffer is divided + * into two parts: + * + * - data buffer, containing headers and sometimes payload; + * this is the part of the skb operated on by the common helpers + * such as skb_put() or skb_pull(); + * - shared info (struct skb_shared_info) which holds an array of pointers + * to read-only data in the (page, offset, length) format. + * + * Optionally &skb_shared_info.frag_list may point to another skb. + * + * Basic diagram may look like this:: + * + * --------------- + * | sk_buff | + * --------------- + * ,--------------------------- + head + * / ,----------------- + data + * / / ,----------- + tail + * | | | , + end + * | | | | + * v v v v + * ----------------------------------------------- + * | headroom | data | tailroom | skb_shared_info | + * ----------------------------------------------- + * + [page frag] + * + [page frag] + * + [page frag] + * + [page frag] --------- + * + frag_list --> | sk_buff | + * --------- + * + */ + /** * struct sk_buff - socket buffer * @next: Next buffer in list -- cgit v1.2.3 From 9ec7ea1462084df695f34c5ac2d2d2250d9d6897 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 May 2022 09:04:55 -0700 Subject: skbuff: rewrite the doc for data-only skbs The comment about shinfo->dataref split is really unhelpful, at least to me. Rewrite it and render it to skb documentation. Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 69802624276d..0e492f9bf532 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -727,16 +727,32 @@ struct skb_shared_info { skb_frag_t frags[MAX_SKB_FRAGS]; }; -/* We divide dataref into two halves. The higher 16 bits hold references - * to the payload part of skb->data. The lower 16 bits hold references to - * the entire skb->data. A clone of a headerless skb holds the length of - * the header in skb->hdr_len. - * - * All users must obey the rule that the skb->data reference count must be - * greater than or equal to the payload reference count. - * - * Holding a reference to the payload part means that the user does not - * care about modifications to the header part of skb->data. +/** + * DOC: dataref and headerless skbs + * + * Transport layers send out clones of payload skbs they hold for + * retransmissions. To allow lower layers of the stack to prepend their headers + * we split &skb_shared_info.dataref into two halves. + * The lower 16 bits count the overall number of references. + * The higher 16 bits indicate how many of the references are payload-only. + * skb_header_cloned() checks if skb is allowed to add / write the headers. + * + * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr + * (via __skb_header_release()). Any clone created from marked skb will get + * &sk_buff.hdr_len populated with the available headroom. + * If there's the only clone in existence it's able to modify the headroom + * at will. The sequence of calls inside the transport layer is:: + * + * + * skb_reserve() + * __skb_header_release() + * skb_clone() + * // send the clone down the stack + * + * This is not a very generic construct and it depends on the transport layers + * doing the right thing. In practice there's usually only one payload-only skb. + * Having multiple payload-only skbs with different lengths of hdr_len is not + * possible. The payload-only skbs should never leave their owner. */ #define SKB_DATAREF_SHIFT 16 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1) @@ -2027,8 +2043,10 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) } /** - * __skb_header_release - release reference to header - * @skb: buffer to operate on + * __skb_header_release() - allow clones to use the headroom + * @skb: buffer to operate on + * + * See "DOC: dataref and headerless skbs". */ static inline void __skb_header_release(struct sk_buff *skb) { -- cgit v1.2.3 From 9facd94114b59afebd405e74b3bc2bf184efe986 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 May 2022 09:04:56 -0700 Subject: skbuff: render the checksum comment to documentation Long time ago Tom added a giant comment to skbuff.h explaining checksums. Now that we have a place in Documentation for skbuff docs we should render it. Sprinkle some markup while at it. Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 219 ++++++++++++++++++++++++++++--------------------- 1 file changed, 124 insertions(+), 95 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0e492f9bf532..5296b2c37f62 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -43,98 +43,112 @@ #include #endif -/* The interface for checksum offload between the stack and networking drivers +/** + * DOC: skb checksums + * + * The interface for checksum offload between the stack and networking drivers * is as follows... * - * A. IP checksum related features + * IP checksum related features + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Drivers advertise checksum offload capabilities in the features of a device. * From the stack's point of view these are capabilities offered by the driver. * A driver typically only advertises features that it is capable of offloading * to its device. * - * The checksum related features are: - * - * NETIF_F_HW_CSUM - The driver (or its device) is able to compute one - * IP (one's complement) checksum for any combination - * of protocols or protocol layering. The checksum is - * computed and set in a packet per the CHECKSUM_PARTIAL - * interface (see below). - * - * NETIF_F_IP_CSUM - Driver (device) is only able to checksum plain - * TCP or UDP packets over IPv4. These are specifically - * unencapsulated packets of the form IPv4|TCP or - * IPv4|UDP where the Protocol field in the IPv4 header - * is TCP or UDP. The IPv4 header may contain IP options. - * This feature cannot be set in features for a device - * with NETIF_F_HW_CSUM also set. This feature is being - * DEPRECATED (see below). - * - * NETIF_F_IPV6_CSUM - Driver (device) is only able to checksum plain - * TCP or UDP packets over IPv6. These are specifically - * unencapsulated packets of the form IPv6|TCP or - * IPv6|UDP where the Next Header field in the IPv6 - * header is either TCP or UDP. IPv6 extension headers - * are not supported with this feature. This feature - * cannot be set in features for a device with - * NETIF_F_HW_CSUM also set. This feature is being - * DEPRECATED (see below). - * - * NETIF_F_RXCSUM - Driver (device) performs receive checksum offload. - * This flag is only used to disable the RX checksum - * feature for a device. The stack will accept receive - * checksum indication in packets received on a device - * regardless of whether NETIF_F_RXCSUM is set. - * - * B. Checksumming of received packets by device. Indication of checksum - * verification is set in skb->ip_summed. Possible values are: - * - * CHECKSUM_NONE: + * .. flat-table:: Checksum related device features + * :widths: 1 10 + * + * * - %NETIF_F_HW_CSUM + * - The driver (or its device) is able to compute one + * IP (one's complement) checksum for any combination + * of protocols or protocol layering. The checksum is + * computed and set in a packet per the CHECKSUM_PARTIAL + * interface (see below). + * + * * - %NETIF_F_IP_CSUM + * - Driver (device) is only able to checksum plain + * TCP or UDP packets over IPv4. These are specifically + * unencapsulated packets of the form IPv4|TCP or + * IPv4|UDP where the Protocol field in the IPv4 header + * is TCP or UDP. The IPv4 header may contain IP options. + * This feature cannot be set in features for a device + * with NETIF_F_HW_CSUM also set. This feature is being + * DEPRECATED (see below). + * + * * - %NETIF_F_IPV6_CSUM + * - Driver (device) is only able to checksum plain + * TCP or UDP packets over IPv6. These are specifically + * unencapsulated packets of the form IPv6|TCP or + * IPv6|UDP where the Next Header field in the IPv6 + * header is either TCP or UDP. IPv6 extension headers + * are not supported with this feature. This feature + * cannot be set in features for a device with + * NETIF_F_HW_CSUM also set. This feature is being + * DEPRECATED (see below). + * + * * - %NETIF_F_RXCSUM + * - Driver (device) performs receive checksum offload. + * This flag is only used to disable the RX checksum + * feature for a device. The stack will accept receive + * checksum indication in packets received on a device + * regardless of whether NETIF_F_RXCSUM is set. + * + * Checksumming of received packets by device + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Indication of checksum verification is set in &sk_buff.ip_summed. + * Possible values are: + * + * - %CHECKSUM_NONE * * Device did not checksum this packet e.g. due to lack of capabilities. * The packet contains full (though not verified) checksum in packet but * not in skb->csum. Thus, skb->csum is undefined in this case. * - * CHECKSUM_UNNECESSARY: + * - %CHECKSUM_UNNECESSARY * * The hardware you're dealing with doesn't calculate the full checksum - * (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums - * for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY - * if their checksums are okay. skb->csum is still undefined in this case + * (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums + * for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY + * if their checksums are okay. &sk_buff.csum is still undefined in this case * though. A driver or device must never modify the checksum field in the * packet even if checksum is verified. * - * CHECKSUM_UNNECESSARY is applicable to following protocols: - * TCP: IPv6 and IPv4. - * UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a + * %CHECKSUM_UNNECESSARY is applicable to following protocols: + * + * - TCP: IPv6 and IPv4. + * - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a * zero UDP checksum for either IPv4 or IPv6, the networking stack * may perform further validation in this case. - * GRE: only if the checksum is present in the header. - * SCTP: indicates the CRC in SCTP header has been validated. - * FCOE: indicates the CRC in FC frame has been validated. + * - GRE: only if the checksum is present in the header. + * - SCTP: indicates the CRC in SCTP header has been validated. + * - FCOE: indicates the CRC in FC frame has been validated. * - * skb->csum_level indicates the number of consecutive checksums found in - * the packet minus one that have been verified as CHECKSUM_UNNECESSARY. + * &sk_buff.csum_level indicates the number of consecutive checksums found in + * the packet minus one that have been verified as %CHECKSUM_UNNECESSARY. * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet * and a device is able to verify the checksums for UDP (possibly zero), - * GRE (checksum flag is set) and TCP, skb->csum_level would be set to + * GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to * two. If the device were only able to verify the UDP checksum and not * GRE, either because it doesn't support GRE checksum or because GRE * checksum is bad, skb->csum_level would be set to zero (TCP checksum is * not considered in this case). * - * CHECKSUM_COMPLETE: + * - %CHECKSUM_COMPLETE * * This is the most generic way. The device supplied checksum of the _whole_ - * packet as seen by netif_rx() and fills in skb->csum. This means the + * packet as seen by netif_rx() and fills in &sk_buff.csum. This means the * hardware doesn't need to parse L3/L4 headers to implement this. * * Notes: + * * - Even if device supports only some protocols, but is able to produce * skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY. * - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols. * - * CHECKSUM_PARTIAL: + * - %CHECKSUM_PARTIAL * * A checksum is set up to be offloaded to a device as described in the * output description for CHECKSUM_PARTIAL. This may occur on a packet @@ -146,14 +160,18 @@ * packet that are after the checksum being offloaded are not considered to * be verified. * - * C. Checksumming on transmit for non-GSO. The stack requests checksum offload - * in the skb->ip_summed for a packet. Values are: + * Checksumming on transmit for non-GSO + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * The stack requests checksum offload in the &sk_buff.ip_summed for a packet. + * Values are: * - * CHECKSUM_PARTIAL: + * - %CHECKSUM_PARTIAL * * The driver is required to checksum the packet as seen by hard_start_xmit() - * from skb->csum_start up to the end, and to record/write the checksum at - * offset skb->csum_start + skb->csum_offset. A driver may verify that the + * from &sk_buff.csum_start up to the end, and to record/write the checksum at + * offset &sk_buff.csum_start + &sk_buff.csum_offset. + * A driver may verify that the * csum_start and csum_offset values are valid values given the length and * offset of the packet, but it should not attempt to validate that the * checksum refers to a legitimate transport layer checksum -- it is the @@ -165,55 +183,66 @@ * checksum calculation to the device, or call skb_checksum_help (in the case * that the device does not support offload for a particular checksum). * - * NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM are being deprecated in favor of - * NETIF_F_HW_CSUM. New devices should use NETIF_F_HW_CSUM to indicate + * %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of + * %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate * checksum offload capability. - * skb_csum_hwoffload_help() can be called to resolve CHECKSUM_PARTIAL based + * skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based * on network device checksumming capabilities: if a packet does not match - * them, skb_checksum_help or skb_crc32c_help (depending on the value of - * csum_not_inet, see item D.) is called to resolve the checksum. + * them, skb_checksum_help() or skb_crc32c_help() (depending on the value of + * &sk_buff.csum_not_inet, see :ref:`crc`) + * is called to resolve the checksum. * - * CHECKSUM_NONE: + * - %CHECKSUM_NONE * * The skb was already checksummed by the protocol, or a checksum is not * required. * - * CHECKSUM_UNNECESSARY: + * - %CHECKSUM_UNNECESSARY * * This has the same meaning as CHECKSUM_NONE for checksum offload on * output. * - * CHECKSUM_COMPLETE: + * - %CHECKSUM_COMPLETE + * * Not used in checksum output. If a driver observes a packet with this value - * set in skbuff, it should treat the packet as if CHECKSUM_NONE were set. - * - * D. Non-IP checksum (CRC) offloads - * - * NETIF_F_SCTP_CRC - This feature indicates that a device is capable of - * offloading the SCTP CRC in a packet. To perform this offload the stack - * will set csum_start and csum_offset accordingly, set ip_summed to - * CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in - * the skbuff that the CHECKSUM_PARTIAL refers to CRC32c. - * A driver that supports both IP checksum offload and SCTP CRC32c offload - * must verify which offload is configured for a packet by testing the - * value of skb->csum_not_inet; skb_crc32c_csum_help is provided to resolve - * CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1. - * - * NETIF_F_FCOE_CRC - This feature indicates that a device is capable of - * offloading the FCOE CRC in a packet. To perform this offload the stack - * will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset - * accordingly. Note that there is no indication in the skbuff that the - * CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports - * both IP checksum offload and FCOE CRC offload must verify which offload - * is configured for a packet, presumably by inspecting packet headers. - * - * E. Checksumming on output with GSO. - * - * In the case of a GSO packet (skb_is_gso(skb) is true), checksum offload + * set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set. + * + * .. _crc: + * + * Non-IP checksum (CRC) offloads + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * .. flat-table:: + * :widths: 1 10 + * + * * - %NETIF_F_SCTP_CRC + * - This feature indicates that a device is capable of + * offloading the SCTP CRC in a packet. To perform this offload the stack + * will set csum_start and csum_offset accordingly, set ip_summed to + * %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication + * in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c. + * A driver that supports both IP checksum offload and SCTP CRC32c offload + * must verify which offload is configured for a packet by testing the + * value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to + * resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1. + * + * * - %NETIF_F_FCOE_CRC + * - This feature indicates that a device is capable of offloading the FCOE + * CRC in a packet. To perform this offload the stack will set ip_summed + * to %CHECKSUM_PARTIAL and set csum_start and csum_offset + * accordingly. Note that there is no indication in the skbuff that the + * %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports + * both IP checksum offload and FCOE CRC offload must verify which offload + * is configured for a packet, presumably by inspecting packet headers. + * + * Checksumming on output with GSO + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * In the case of a GSO packet (skb_is_gso() is true), checksum offload * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the - * gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as + * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as * part of the GSO operation is implied. If a checksum is being offloaded - * with GSO then ip_summed is CHECKSUM_PARTIAL, and both csum_start and + * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and * csum_offset are set to refer to the outermost checksum being offloaded * (two offloaded checksums are possible with UDP encapsulation). */ -- cgit v1.2.3 From f7e0beaf39d3868dc700d4954b26cf8443c5d423 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:19 -0700 Subject: bpf, x86: Generate trampolines from bpf_tramp_links Replace struct bpf_tramp_progs with struct bpf_tramp_links to collect struct bpf_tramp_link(s) for a trampoline. struct bpf_tramp_link extends bpf_link to act as a linked list node. arch_prepare_bpf_trampoline() accepts a struct bpf_tramp_links to collects all bpf_tramp_link(s) that a trampoline should call. Change BPF trampoline and bpf_struct_ops to pass bpf_tramp_links instead of bpf_tramp_progs. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-2-kuifeng@fb.com --- include/linux/bpf.h | 36 ++++++++++++++++++++++++------------ include/linux/bpf_types.h | 1 + 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 551b7198ae8a..75e0110a65e1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -723,11 +723,11 @@ struct btf_func_model { /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 */ -#define BPF_MAX_TRAMP_PROGS 38 +#define BPF_MAX_TRAMP_LINKS 38 -struct bpf_tramp_progs { - struct bpf_prog *progs[BPF_MAX_TRAMP_PROGS]; - int nr_progs; +struct bpf_tramp_links { + struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; + int nr_links; }; /* Different use cases for BPF trampoline: @@ -753,7 +753,7 @@ struct bpf_tramp_progs { struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call); /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(struct bpf_prog *prog); @@ -852,9 +852,10 @@ static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( { return bpf_func(ctx, insnsi); } + #ifdef CONFIG_BPF_JIT -int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); -int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); @@ -905,12 +906,12 @@ int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else -static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, +static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; } -static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog, +static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; @@ -1009,7 +1010,6 @@ struct bpf_prog_aux { bool tail_call_reachable; bool xdp_has_frags; bool use_bpf_prog_pack; - struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ @@ -1096,6 +1096,18 @@ struct bpf_link_ops { struct bpf_link_info *info); }; +struct bpf_tramp_link { + struct bpf_link link; + struct hlist_node tramp_hlist; +}; + +struct bpf_tracing_link { + struct bpf_tramp_link link; + enum bpf_attach_type attach_type; + struct bpf_trampoline *trampoline; + struct bpf_prog *tgt_prog; +}; + struct bpf_link_primer { struct bpf_link *link; struct file *file; @@ -1133,8 +1145,8 @@ bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs, - struct bpf_prog *prog, +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, + struct bpf_tramp_link *link, const struct btf_func_model *model, void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 3e24ad0c4b3c..2b9112b80171 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -141,3 +141,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) #endif BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi) +BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops) -- cgit v1.2.3 From e384c7b7b46d0a5f4bf3c554f963e6e9622d0ab1 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:20 -0700 Subject: bpf, x86: Create bpf_tramp_run_ctx on the caller thread's stack BPF trampolines will create a bpf_tramp_run_ctx, a bpf_run_ctx, on stacks and set/reset the current bpf_run_ctx before/after calling a bpf_prog. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-3-kuifeng@fb.com --- include/linux/bpf.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 75e0110a65e1..256fb802e580 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -730,6 +730,8 @@ struct bpf_tramp_links { int nr_links; }; +struct bpf_tramp_run_ctx; + /* Different use cases for BPF trampoline: * 1. replace nop at the function entry (kprobe equivalent) * flags = BPF_TRAMP_F_RESTORE_REGS @@ -756,10 +758,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *i struct bpf_tramp_links *tlinks, void *orig_call); /* these two functions are called from generated trampoline */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog); -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog); -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); +u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); @@ -1351,6 +1354,12 @@ struct bpf_trace_run_ctx { u64 bpf_cookie; }; +struct bpf_tramp_run_ctx { + struct bpf_run_ctx run_ctx; + u64 bpf_cookie; + struct bpf_run_ctx *saved_run_ctx; +}; + static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; -- cgit v1.2.3 From 2fcc82411e74e5e6aba336561cf56fb899bfae4e Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:21 -0700 Subject: bpf, x86: Attach a cookie to fentry/fexit/fmod_ret/lsm. Pass a cookie along with BPF_LINK_CREATE requests. Add a bpf_cookie field to struct bpf_tracing_link to attach a cookie. The cookie of a bpf_tracing_link is available by calling bpf_get_attach_cookie when running the BPF program of the attached link. The value of a cookie will be set at bpf_tramp_run_ctx by the trampoline of the link. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-4-kuifeng@fb.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 256fb802e580..aba7ded56436 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1102,6 +1102,7 @@ struct bpf_link_ops { struct bpf_tramp_link { struct bpf_link link; struct hlist_node tramp_hlist; + u64 cookie; }; struct bpf_tracing_link { -- cgit v1.2.3 From 5b87be9e4978fe507c7e14c0bdff147d10d39aec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 May 2022 20:57:37 -0700 Subject: net: add include/net/net_debug.h Remove from include/linux/netdevice.h helpers that send debug/info/warnings to syslog. We plan adding more helpers in following patches. v2: added two includes, and 'struct net_device' forward declaration to avoid compile errors (kernel bots) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 141 +--------------------------------------------- 1 file changed, 1 insertion(+), 140 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 51fe143091cf..536321691c72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -50,6 +50,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -5071,81 +5072,9 @@ static inline const char *netdev_reg_state(const struct net_device *dev) return " (unknown)"; } -__printf(3, 4) __cold -void netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...); -__printf(2, 3) __cold -void netdev_emerg(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_alert(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_crit(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_err(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_warn(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_notice(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_info(const struct net_device *dev, const char *format, ...); - -#define netdev_level_once(level, dev, fmt, ...) \ -do { \ - static bool __section(".data.once") __print_once; \ - \ - if (!__print_once) { \ - __print_once = true; \ - netdev_printk(level, dev, fmt, ##__VA_ARGS__); \ - } \ -} while (0) - -#define netdev_emerg_once(dev, fmt, ...) \ - netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__) -#define netdev_alert_once(dev, fmt, ...) \ - netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__) -#define netdev_crit_once(dev, fmt, ...) \ - netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__) -#define netdev_err_once(dev, fmt, ...) \ - netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__) -#define netdev_warn_once(dev, fmt, ...) \ - netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__) -#define netdev_notice_once(dev, fmt, ...) \ - netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__) -#define netdev_info_once(dev, fmt, ...) \ - netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__) - #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) -#define netdev_dbg(__dev, format, args...) \ -do { \ - dynamic_netdev_dbg(__dev, format, ##args); \ -} while (0) -#elif defined(DEBUG) -#define netdev_dbg(__dev, format, args...) \ - netdev_printk(KERN_DEBUG, __dev, format, ##args) -#else -#define netdev_dbg(__dev, format, args...) \ -({ \ - if (0) \ - netdev_printk(KERN_DEBUG, __dev, format, ##args); \ -}) -#endif - -#if defined(VERBOSE_DEBUG) -#define netdev_vdbg netdev_dbg -#else - -#define netdev_vdbg(dev, format, args...) \ -({ \ - if (0) \ - netdev_printk(KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - /* * netdev_WARN() acts like dev_printk(), but with the key difference * of using a WARN/WARN_ON to get the message out, including the @@ -5159,74 +5088,6 @@ do { \ WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev), \ netdev_reg_state(dev), ##args) -/* netif printk helpers, similar to netdev_printk */ - -#define netif_printk(priv, type, level, dev, fmt, args...) \ -do { \ - if (netif_msg_##type(priv)) \ - netdev_printk(level, (dev), fmt, ##args); \ -} while (0) - -#define netif_level(level, priv, type, dev, fmt, args...) \ -do { \ - if (netif_msg_##type(priv)) \ - netdev_##level(dev, fmt, ##args); \ -} while (0) - -#define netif_emerg(priv, type, dev, fmt, args...) \ - netif_level(emerg, priv, type, dev, fmt, ##args) -#define netif_alert(priv, type, dev, fmt, args...) \ - netif_level(alert, priv, type, dev, fmt, ##args) -#define netif_crit(priv, type, dev, fmt, args...) \ - netif_level(crit, priv, type, dev, fmt, ##args) -#define netif_err(priv, type, dev, fmt, args...) \ - netif_level(err, priv, type, dev, fmt, ##args) -#define netif_warn(priv, type, dev, fmt, args...) \ - netif_level(warn, priv, type, dev, fmt, ##args) -#define netif_notice(priv, type, dev, fmt, args...) \ - netif_level(notice, priv, type, dev, fmt, ##args) -#define netif_info(priv, type, dev, fmt, args...) \ - netif_level(info, priv, type, dev, fmt, ##args) - -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) -#define netif_dbg(priv, type, netdev, format, args...) \ -do { \ - if (netif_msg_##type(priv)) \ - dynamic_netdev_dbg(netdev, format, ##args); \ -} while (0) -#elif defined(DEBUG) -#define netif_dbg(priv, type, dev, format, args...) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args) -#else -#define netif_dbg(priv, type, dev, format, args...) \ -({ \ - if (0) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - -/* if @cond then downgrade to debug, else print at @level */ -#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...) \ - do { \ - if (cond) \ - netif_dbg(priv, type, netdev, fmt, ##args); \ - else \ - netif_ ## level(priv, type, netdev, fmt, ##args); \ - } while (0) - -#if defined(VERBOSE_DEBUG) -#define netif_vdbg netif_dbg -#else -#define netif_vdbg(priv, type, dev, format, args...) \ -({ \ - if (0) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. -- cgit v1.2.3 From 66e4c8d950083df8e12981babca788e1635c92b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 May 2022 20:57:39 -0700 Subject: net: warn if transport header was not set Make sure skb_transport_header() and skb_transport_offset() uses are not fooled if the transport header has not been set. This change will likely expose existing bugs in linux networking stacks. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5296b2c37f62..b91d225fdc13 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -42,6 +42,7 @@ #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif +#include /** * DOC: skb checksums @@ -2904,6 +2905,7 @@ static inline bool skb_transport_header_was_set(const struct sk_buff *skb) static inline unsigned char *skb_transport_header(const struct sk_buff *skb) { + DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)); return skb->head + skb->transport_header; } -- cgit v1.2.3 From 5b74c690e1c55953ec99fd9dab74f72dbee4fe95 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 12 May 2022 01:16:51 +0530 Subject: bpf: Fix sparse warning for bpf_kptr_xchg_proto Kernel Test Robot complained about missing static storage class annotation for bpf_kptr_xchg_proto variable. sparse: symbol 'bpf_kptr_xchg_proto' was not declared. Should it be static? This caused by missing extern definition in the header. Add it to suppress the sparse warning. Fixes: c0a5a21c25f3 ("bpf: Allow storing referenced kptr in map") Reported-by: kernel test robot Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220511194654.765705-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aba7ded56436..3ded8711457f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2246,6 +2246,7 @@ extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_strncmp_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; +extern const struct bpf_func_proto bpf_kptr_xchg_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From 07343110b293456d30393e89b86c4dee1ac051c8 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Wed, 11 May 2022 17:38:53 +0800 Subject: bpf: add bpf_map_lookup_percpu_elem for percpu map Add new ebpf helpers bpf_map_lookup_percpu_elem. The implementation method is relatively simple, refer to the implementation method of map_lookup_elem of percpu map, increase the parameters of cpu, and obtain it according to the specified cpu. Signed-off-by: Feng Zhou Link: https://lore.kernel.org/r/20220511093854.411-2-zhoufeng.zf@bytedance.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3ded8711457f..5061ccd8b2dc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -89,6 +89,7 @@ struct bpf_map_ops { int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); int (*map_pop_elem)(struct bpf_map *map, void *value); int (*map_peek_elem)(struct bpf_map *map, void *value); + void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -2184,6 +2185,7 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_map_push_elem_proto; extern const struct bpf_func_proto bpf_map_pop_elem_proto; extern const struct bpf_func_proto bpf_map_peek_elem_proto; +extern const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; -- cgit v1.2.3 From 43213daed6d6cb60e8cf69058a6db8648a556d9d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 May 2022 19:53:01 -0700 Subject: fortify: Provide a memcpy trap door for sharp corners As we continue to narrow the scope of what the FORTIFY memcpy() will accept and build alternative APIs that give the compiler appropriate visibility into more complex memcpy scenarios, there is a need for "unfortified" memcpy use in rare cases where combinations of compiler behaviors, source code layout, etc, result in cases where the stricter memcpy checks need to be bypassed until appropriate solutions can be developed (i.e. fix compiler bugs, code refactoring, new API, etc). The intention is for this to be used only if there's no other reasonable solution, for its use to include a justification that can be used to assess future solutions, and for it to be temporary. Example usage included, based on analysis and discussion from: https://lore.kernel.org/netdev/CANn89iLS_2cshtuXPyNUGDPaic=sJiYfvTb_wNLgWrZRyBxZ_g@mail.gmail.com Cc: Jakub Kicinski Cc: Eric Dumazet Cc: "David S. Miller" Cc: Paolo Abeni Cc: Coco Li Cc: Tariq Toukan Cc: Saeed Mahameed Cc: Leon Romanovsky Cc: netdev@vger.kernel.org Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220511025301.3636666-1-keescook@chromium.org Signed-off-by: Paolo Abeni --- include/linux/fortify-string.h | 16 ++++++++++++++++ include/linux/string.h | 4 ++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 295637a66c46..3b401fa0f374 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -52,6 +52,22 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) #define __underlying_strncpy __builtin_strncpy #endif +/** + * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking + * + * @dst: Destination memory address to write to + * @src: Source memory address to read from + * @bytes: How many bytes to write to @dst from @src + * @justification: Free-form text or comment describing why the use is needed + * + * This should be used for corner cases where the compiler cannot do the + * right thing, or during transitions between APIs, etc. It should be used + * very rarely, and includes a place for justification detailing where bounds + * checking has happened, and why existing solutions cannot be employed. + */ +#define unsafe_memcpy(dst, src, bytes, justification) \ + __underlying_memcpy(dst, src, bytes) + /* * Clang's use of __builtin_object_size() within inlines needs hinting via * __pass_object_size(). The preference is to only ever use type 1 (member diff --git a/include/linux/string.h b/include/linux/string.h index b6572aeca2f5..61ec7e4f6311 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -252,6 +252,10 @@ static inline const char *kbasename(const char *path) #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) #include #endif +#ifndef unsafe_memcpy +#define unsafe_memcpy(dst, src, bytes, justification) \ + memcpy(dst, src, bytes) +#endif void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, int pad); -- cgit v1.2.3 From 0df65743537dd6e1c8b0924714f14dc367cba2be Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 May 2022 10:23:05 -0700 Subject: skbuff: replace a BUG_ON() with the new DEBUG_NET_WARN_ON_ONCE() Very few drivers actually have Kconfig knobs for adding -DDEBUG. 8 according to a quick grep, while there are 93 users of skb_checksum_none_assert(). Switch to the new DEBUG_NET_WARN_ON_ONCE() to catch bad skbs. Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220511172305.1382810-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b91d225fdc13..9d82a8b6c8f1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5048,9 +5048,7 @@ static inline void skb_forward_csum(struct sk_buff *skb) */ static inline void skb_checksum_none_assert(const struct sk_buff *skb) { -#ifdef DEBUG - BUG_ON(skb->ip_summed != CHECKSUM_NONE); -#endif + DEBUG_NET_WARN_ON_ONCE(skb->ip_summed != CHECKSUM_NONE); } bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); -- cgit v1.2.3 From 16d1e00c7e8a4950e914223b3112144289a82913 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 9 May 2022 15:42:52 -0700 Subject: bpf: Add MEM_UNINIT as a bpf_type_flag Instead of having uninitialized versions of arguments as separate bpf_arg_types (eg ARG_PTR_TO_UNINIT_MEM as the uninitialized version of ARG_PTR_TO_MEM), we can instead use MEM_UNINIT as a bpf_type_flag modifier to denote that the argument is uninitialized. Doing so cleans up some of the logic in the verifier. We no longer need to do two checks against an argument type (eg "if (base_type(arg_type) == ARG_PTR_TO_MEM || base_type(arg_type) == ARG_PTR_TO_UNINIT_MEM)"), since uninitialized and initialized versions of the same argument type will now share the same base type. In the near future, MEM_UNINIT will be used by dynptr helper functions as well. Signed-off-by: Joanne Koong Acked-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/r/20220509224257.3222614-2-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5061ccd8b2dc..c107392b0ba7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -390,7 +390,10 @@ enum bpf_type_flag { */ PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = PTR_UNTRUSTED, + MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_FLAG_MAX, + __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; /* Max number of base types. */ @@ -409,16 +412,11 @@ enum bpf_arg_type { ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ - ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ - /* the following constraints used to prototype bpf_memcmp() and other - * functions that access data on eBPF program stack + /* Used to prototype bpf_memcmp() and other functions that access data + * on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ - ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, - * helper function must fill all bytes or clear - * them in error case. - */ ARG_CONST_SIZE, /* number of bytes accessed from memory */ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ @@ -450,6 +448,10 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + /* pointer to memory does not need to be initialized, helper function must fill + * all bytes or clear them in error case. + */ + ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | ARG_PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. -- cgit v1.2.3 From 7c4e983c4f3cf94fcd879730c6caa877e0768a4d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:33:57 -0700 Subject: net: allow gso_max_size to exceed 65536 The code for gso_max_size was added originally to allow for debugging and workaround of buggy devices that couldn't support TSO with blocks 64K in size. The original reason for limiting it to 64K was because that was the existing limits of IPv4 and non-jumbogram IPv6 length fields. With the addition of Big TCP we can remove this limit and allow the value to potentially go up to UINT_MAX and instead be limited by the tso_max_size value. So in order to support this we need to go through and clean up the remaining users of the gso_max_size value so that the values will cap at 64K for non-TCPv6 flows. In addition we can clean up the GSO_MAX_SIZE value so that 64K becomes GSO_LEGACY_MAX_SIZE and UINT_MAX will now be the upper limit for GSO_MAX_SIZE. v6: (edumazet) fixed a compile error if CONFIG_IPV6=n, in a new sk_trim_gso_size() helper. netif_set_tso_max_size() caps the requested TSO size with GSO_MAX_SIZE. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 536321691c72..ce780e352f43 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2272,7 +2272,9 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ -#define GSO_MAX_SIZE 65536 +#define GSO_LEGACY_MAX_SIZE 65536u +#define GSO_MAX_SIZE UINT_MAX + unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX -- cgit v1.2.3 From 34b92e8d19da1e44070dc0ecc2747871469ac534 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:33:58 -0700 Subject: net: limit GSO_MAX_SIZE to 524280 bytes Make sure we will not overflow shinfo->gso_segs Minimal TCP MSS size is 8 bytes, and shinfo->gso_segs is a 16bit field. TCP_MIN_GSO_SIZE is currently defined in include/net/tcp.h, it seems cleaner to not bring tcp details into include/linux/netdevice.h Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce780e352f43..fd38847d0dc7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2272,14 +2272,17 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ +#define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u -#define GSO_MAX_SIZE UINT_MAX +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. + */ +#define GSO_MAX_SIZE (8 * GSO_MAX_SEGS) unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; -#define GSO_MAX_SEGS 65535 u16 gso_max_segs; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; -- cgit v1.2.3 From 0fe79f28bfaf73b66b7b1562d2468f94aa03bd12 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:34:03 -0700 Subject: net: allow gro_max_size to exceed 65536 Allow the gro_max_size to exceed a value larger than 65536. There weren't really any external limitations that prevented this other than the fact that IPv4 only supports a 16 bit length field. Since we have the option of adding a hop-by-hop header for IPv6 we can allow IPv6 to exceed this value and for IPv4 and non-TCP flows we can cap things at 65536 via a constant rather than relying on gro_max_size. [edumazet] limit GRO_MAX_SIZE to (8 * 65535) to avoid overflows. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fd38847d0dc7..d57ce248004c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2161,7 +2161,11 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; int napi_defer_hard_irqs; -#define GRO_MAX_SIZE 65536 +#define GRO_LEGACY_MAX_SIZE 65536u +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. + */ +#define GRO_MAX_SIZE (8 * 65535u) unsigned int gro_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; -- cgit v1.2.3 From 80e425b613421911f89664663a7060216abcaed2 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Fri, 13 May 2022 11:34:04 -0700 Subject: ipv6: Add hop-by-hop header to jumbograms in ip6_output Instead of simply forcing a 0 payload_len in IPv6 header, implement RFC 2675 and insert a custom extension header. Note that only TCP stack is currently potentially generating jumbograms, and that this extension header is purely local, it wont be sent on a physical link. This is needed so that packet capture (tcpdump and friends) can properly dissect these large packets. Signed-off-by: Coco Li Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ec5ca392eaa3..38c8203d52cb 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -145,6 +145,7 @@ struct inet6_skb_parm { #define IP6SKB_L3SLAVE 64 #define IP6SKB_JUMBOGRAM 128 #define IP6SKB_SEG6 256 +#define IP6SKB_FAKEJUMBO 512 }; #if defined(CONFIG_NET_L3_MASTER_DEV) -- cgit v1.2.3 From 7ebd3f3ee51a9e02994cd0a1be44fbd325d1e0dc Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 13 May 2022 11:03:38 +0800 Subject: net: skb: change the definition SKB_DR_SET() The SKB_DR_OR() is used to set the drop reason to a value when it is not set or specified yet. SKB_NOT_DROPPED_YET should also be considered as not set. Reviewed-by: Jiang Biao Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9d82a8b6c8f1..5a2b29df6cab 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -510,7 +510,8 @@ enum skb_drop_reason { (name = SKB_DROP_REASON_##reason) #define SKB_DR_OR(name, reason) \ do { \ - if (name == SKB_DROP_REASON_NOT_SPECIFIED) \ + if (name == SKB_DROP_REASON_NOT_SPECIFIED || \ + name == SKB_NOT_DROPPED_YET) \ SKB_DR_SET(name, reason); \ } while (0) -- cgit v1.2.3 From 97e719a82b43c6c2bb5eebdb3c5d479a332ac2ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 May 2022 21:24:53 -0700 Subject: net: fix possible race in skb_attempt_defer_free() A cpu can observe sd->defer_count reaching 128, and call smp_call_function_single_async() Problem is that the remote CPU can clear sd->defer_count before the IPI is run/acknowledged. Other cpus can queue more packets and also decide to call smp_call_function_single_async() while the pending IPI was not yet delivered. This is a common issue with smp_call_function_single_async(). Callers must ensure correct synchronization and serialization. I triggered this issue while experimenting smaller threshold. Performing the call to smp_call_function_single_async() under sd->defer_lock protection did not solve the problem. Commit 5a18ceca6350 ("smp: Allow smp_call_function_single_async() to insert locked csd") replaced an informative WARN_ON_ONCE() with a return of -EBUSY, which is often ignored. Test of CSD_FLAG_LOCK presence is racy anyway. Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d57ce248004c..cbaf312e365b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3136,6 +3136,7 @@ struct softnet_data { /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; int defer_count; + int defer_ipi_scheduled; struct sk_buff *defer_list; call_single_data_t defer_csd; }; -- cgit v1.2.3 From 89af2ce2d95c80f6da9388d9da2e35c84c2573b1 Mon Sep 17 00:00:00 2001 From: Ricardo Martinez Date: Fri, 13 May 2022 10:34:00 -0700 Subject: net: skb: Remove skb_data_area_size() skb_data_area_size() is not needed. As Jakub pointed out [1]: For Rx, drivers can use the size passed during skb allocation or use skb_tailroom(). For Tx, drivers should use skb_headlen(). [1] https://lore.kernel.org/netdev/CAHNKnsTmH-rGgWi3jtyC=ktM1DW2W1VJkYoTMJV2Z_Bt498bsg@mail.gmail.com/ Signed-off-by: Ricardo Martinez Reviewed-by: Andy Shevchenko Reviewed-by: Sergey Ryazanov Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5a2b29df6cab..da96f0d3e753 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1765,11 +1765,6 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) } #endif -static inline unsigned int skb_data_area_size(struct sk_buff *skb) -{ - return skb_end_pointer(skb) - skb->data; -} - struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); -- cgit v1.2.3 From bec67592521ec816371f5f072b1a340e1c2ad434 Mon Sep 17 00:00:00 2001 From: Min Li Date: Mon, 16 May 2022 10:47:06 -0400 Subject: ptp: ptp_clockmatrix: Add PTP_CLK_REQ_EXTTS support Use TOD_READ_SECONDARY for extts to keep TOD_READ_PRIMARY for gettime and settime exclusively. Before this change, TOD_READ_PRIMARY was used for both extts and gettime/settime, which would result in changing TOD read/write triggers between operations. Using TOD_READ_SECONDARY would make extts independent of gettime/settime operation Signed-off-by: Min Li Acked-by: Richard Cochran Link: https://lore.kernel.org/r/1652712427-14703-1-git-send-email-min.li.xe@renesas.com Signed-off-by: Jakub Kicinski --- include/linux/mfd/idt8a340_reg.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/idt8a340_reg.h b/include/linux/mfd/idt8a340_reg.h index a18c1539a152..0c706085c205 100644 --- a/include/linux/mfd/idt8a340_reg.h +++ b/include/linux/mfd/idt8a340_reg.h @@ -407,7 +407,7 @@ #define TOD_READ_PRIMARY_0 0xcc40 #define TOD_READ_PRIMARY_0_V520 0xcc50 /* 8-bit subns, 32-bit ns, 48-bit seconds */ -#define TOD_READ_PRIMARY 0x0000 +#define TOD_READ_PRIMARY_BASE 0x0000 /* Counter increments after TOD write is completed */ #define TOD_READ_PRIMARY_COUNTER 0x000b /* Read trigger configuration */ @@ -424,6 +424,16 @@ #define TOD_READ_SECONDARY_0 0xcc90 #define TOD_READ_SECONDARY_0_V520 0xcca0 +/* 8-bit subns, 32-bit ns, 48-bit seconds */ +#define TOD_READ_SECONDARY_BASE 0x0000 +/* Counter increments after TOD write is completed */ +#define TOD_READ_SECONDARY_COUNTER 0x000b +/* Read trigger configuration */ +#define TOD_READ_SECONDARY_SEL_CFG_0 0x000c +/* Read trigger selection */ +#define TOD_READ_SECONDARY_CMD 0x000e +#define TOD_READ_SECONDARY_CMD_V520 0x000f + #define TOD_READ_SECONDARY_1 0xcca0 #define TOD_READ_SECONDARY_1_V520 0xccb0 #define TOD_READ_SECONDARY_2 0xccb0 -- cgit v1.2.3 From 1d2c717bc7f7fd3c9cf38d4a0d5d7ede06adf05b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 13 May 2022 06:19:31 +0300 Subject: net/mlx5: Add last command failure syndrome to debugfs Add syndrome of last command failure per command type to debugfs to ease debugging of such failure. last_failed_syndrome - last command failed syndrome returned by FW. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d6bac3976913..74c8cfb771a2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -272,6 +272,8 @@ struct mlx5_cmd_stats { u32 last_failed_errno; /* last bad status returned by FW */ u8 last_failed_mbox_status; + /* last command failed syndrome returned by FW */ + u32 last_failed_syndrome; struct dentry *root; /* protect command average calculations */ spinlock_t lock; -- cgit v1.2.3 From 9b45bde82c229fda94618896ff530dcba9d66fe0 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 25 Jan 2022 14:47:36 +0200 Subject: net/mlx5: Inline db alloc API function Take the wrapper version which picks default node into a header file. This reduces the number of exported functions. Signed-off-by: Tariq Toukan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 74c8cfb771a2..b064bc278f52 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1053,9 +1053,14 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, void *data_out, int size_out, u16 reg_num, int arg, int write); -int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db); int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db, int node); + +static inline int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db) +{ + return mlx5_db_alloc_node(dev, db, dev->priv.numa_node); +} + void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db); const char *mlx5_command_str(int command); -- cgit v1.2.3 From 94db3317781922ba52722c58061e0e8517d4d80d Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 31 Jan 2022 07:49:51 +0200 Subject: net/mlx5: Support multiport eswitch mode Multiport eswitch mode is a LAG mode that allows to add rules that forward traffic to a specific physical port without being affected by LAG affinity configuration. This mode of operation is mutual exclusive with the other LAG modes used by multipath and bonding. To make the transition between the modes, we maintain a counter on the number of rules specifying one of the uplink representors as the target of mirred egress redirect action. An example of such rule would be: $ tc filter add dev enp8s0f0_0 prot all root flower dst_mac \ 00:11:22:33:44:55 action mirred egress redirect dev enp8s0f0 If the reference count just grows to one and LAG is not in use, we create the LAG in multiport eswitch mode. Other mode changes are not allowed while in this mode. When the reference count reaches zero, we destroy the LAG and let other modes be used if needed. logic also changed such that if forwarding to some uplink destination cannot be guaranteed, we fail the operation so the rule will eventually be in software and not in hardware. Signed-off-by: Eli Cohen Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7bab3e51c61e..78b3d3465dd7 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1359,7 +1359,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 vhca_resource_manager[0x1]; u8 hca_cap_2[0x1]; - u8 reserved_at_21[0x1]; + u8 create_lag_when_not_master_up[0x1]; u8 dtor[0x1]; u8 event_on_vhca_state_teardown_request[0x1]; u8 event_on_vhca_state_in_use[0x1]; @@ -10816,7 +10816,8 @@ struct mlx5_ifc_dcbx_param_bits { enum { MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY = 0, - MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT, + MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT = 1, + MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW = 2, }; struct mlx5_ifc_lagc_bits { -- cgit v1.2.3 From 6c1e423a3c84953edcf91ff03ab97829b287184a Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 18 May 2022 17:45:27 +0200 Subject: can: can-dev: remove obsolete CAN LED support Since commit 30f3b42147ba6f ("can: mark led trigger as broken") the CAN specific LED support was disabled and marked as BROKEN. As the common LED support with CONFIG_LEDS_TRIGGER_NETDEV should do this work now the code can be removed as preparation for a CAN netdevice Kconfig rework. Link: https://lore.kernel.org/all/20220518154527.29046-1-socketcan@hartkopp.net Suggested-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp [mkl: remove led.h from MAINTAINERS] Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 10 ---------- include/linux/can/led.h | 51 ------------------------------------------------- 2 files changed, 61 deletions(-) delete mode 100644 include/linux/can/led.h (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index c2ea47f30046..e22dc03c850e 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -85,15 +84,6 @@ struct can_priv { int (*do_get_berr_counter)(const struct net_device *dev, struct can_berr_counter *bec); int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); - -#ifdef CONFIG_CAN_LEDS - struct led_trigger *tx_led_trig; - char tx_led_trig_name[CAN_LED_NAME_SZ]; - struct led_trigger *rx_led_trig; - char rx_led_trig_name[CAN_LED_NAME_SZ]; - struct led_trigger *rxtx_led_trig; - char rxtx_led_trig_name[CAN_LED_NAME_SZ]; -#endif }; static inline bool can_tdc_is_enabled(const struct can_priv *priv) diff --git a/include/linux/can/led.h b/include/linux/can/led.h deleted file mode 100644 index 7c3cfd798c56..000000000000 --- a/include/linux/can/led.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2012, Fabio Baltieri - */ - -#ifndef _CAN_LED_H -#define _CAN_LED_H - -#include -#include -#include - -enum can_led_event { - CAN_LED_EVENT_OPEN, - CAN_LED_EVENT_STOP, - CAN_LED_EVENT_TX, - CAN_LED_EVENT_RX, -}; - -#ifdef CONFIG_CAN_LEDS - -/* keep space for interface name + "-tx"/"-rx"/"-rxtx" - * suffix and null terminator - */ -#define CAN_LED_NAME_SZ (IFNAMSIZ + 6) - -void can_led_event(struct net_device *netdev, enum can_led_event event); -void devm_can_led_init(struct net_device *netdev); -int __init can_led_notifier_init(void); -void __exit can_led_notifier_exit(void); - -#else - -static inline void can_led_event(struct net_device *netdev, - enum can_led_event event) -{ -} -static inline void devm_can_led_init(struct net_device *netdev) -{ -} -static inline int can_led_notifier_init(void) -{ - return 0; -} -static inline void can_led_notifier_exit(void) -{ -} - -#endif - -#endif /* !_CAN_LED_H */ -- cgit v1.2.3 From 5cebb40bc9554aafcc492431181f43c6231b0459 Mon Sep 17 00:00:00 2001 From: Harini Katakam Date: Wed, 18 May 2022 22:37:56 +0530 Subject: net: macb: Fix PTP one step sync support PTP one step sync packets cannot have CSUM padding and insertion in SW since time stamp is inserted on the fly by HW. In addition, ptp4l version 3.0 and above report an error when skb timestamps are reported for packets that not processed for TX TS after transmission. Add a helper to identify PTP one step sync and fix the above two errors. Add a common mask for PTP header flag field "twoStepflag". Also reset ptp OSS bit when one step is not selected. Fixes: ab91f0a9b5f4 ("net: macb: Add hardware PTP support") Fixes: 653e92a9175e ("net: macb: add support for padding and fcs computation") Signed-off-by: Harini Katakam Reviewed-by: Radhey Shyam Pandey Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220518170756.7752-1-harini.katakam@xilinx.com Signed-off-by: Jakub Kicinski --- include/linux/ptp_classify.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index fefa7790dc46..2b6ea36ad162 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -43,6 +43,9 @@ #define OFF_PTP_SOURCE_UUID 22 /* PTPv1 only */ #define OFF_PTP_SEQUENCE_ID 30 +/* PTP header flag fields */ +#define PTP_FLAG_TWOSTEP BIT(1) + /* Below defines should actually be removed at some point in time. */ #define IP6_HLEN 40 #define UDP_HLEN 8 -- cgit v1.2.3 From 3bc253c2e652cf5f12cd8c00d80d8ec55d67d1a7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 19 May 2022 16:30:10 -0700 Subject: bpf: Add bpf_skc_to_mptcp_sock_proto This patch implements a new struct bpf_func_proto, named bpf_skc_to_mptcp_sock_proto. Define a new bpf_id BTF_SOCK_TYPE_MPTCP, and a new helper bpf_skc_to_mptcp_sock(), which invokes another new helper bpf_mptcp_sock_from_subflow() in net/mptcp/bpf.c to get struct mptcp_sock from a given subflow socket. v2: Emit BTF type, add func_id checks in verifier.c and bpf_trace.c, remove build check for CONFIG_BPF_JIT v5: Drop EXPORT_SYMBOL (Martin) Co-developed-by: Nicolas Rybowski Co-developed-by: Matthieu Baerts Signed-off-by: Nicolas Rybowski Signed-off-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220519233016.105670-2-mathew.j.martineau@linux.intel.com --- include/linux/bpf.h | 1 + include/linux/btf_ids.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c107392b0ba7..a3ef078401cf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2231,6 +2231,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_snprintf_proto; diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index bc5d9cc34e4c..335a19092368 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -178,7 +178,8 @@ extern struct btf_id_set name; BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock) enum { #define BTF_SOCK_TYPE(name, str) name, -- cgit v1.2.3 From b23316aabffa835ecc516cb81daeef5b9155e8a5 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 19 May 2022 23:06:10 +0800 Subject: selftests/bpf: Add missing trampoline program type to trampoline_count test Currently the trampoline_count test doesn't include any fmod_ret bpf programs, fix it to make the test cover all possible trampoline program types. Since fmod_ret bpf programs can't be attached to __set_task_comm function, as it's neither whitelisted for error injection nor a security hook, change it to bpf_modify_return_test. This patch also does some other cleanups such as removing duplicate code, dropping inconsistent comments, etc. Signed-off-by: Yuntao Wang Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220519150610.601313-1-ytcoode@gmail.com --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a3ef078401cf..cc4d5e394031 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -724,7 +724,7 @@ struct btf_func_model { #define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 - * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 + * bytes on x86. */ #define BPF_MAX_TRAMP_LINKS 38 -- cgit v1.2.3 From cc4e7fa549cb69c18bc2ba7209df373ca06749e3 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:10:53 +0200 Subject: net: qed: fix typos in comments Spelling mistakes (triple letters) in comments. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- include/linux/qed/qed_fcoe_if.h | 4 ++-- include/linux/qed/qed_iscsi_if.h | 4 ++-- include/linux/qed/qed_nvmetcp_if.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_fcoe_if.h b/include/linux/qed/qed_fcoe_if.h index 16752eca5cbd..90e3045b2dcb 100644 --- a/include/linux/qed/qed_fcoe_if.h +++ b/include/linux/qed/qed_fcoe_if.h @@ -76,7 +76,7 @@ void qed_fcoe_set_pf_params(struct qed_dev *cdev, * @fill_dev_info: fills FCoE specific information * @param cdev * @param info - * @return 0 on sucesss, otherwise error value. + * @return 0 on success, otherwise error value. * @register_ops: register FCoE operations * @param cdev * @param ops - specified using qed_iscsi_cb_ops @@ -96,7 +96,7 @@ void qed_fcoe_set_pf_params(struct qed_dev *cdev, * connection. * @param p_doorbell - qed will fill the address of the * doorbell. - * return 0 on sucesss, otherwise error value. + * return 0 on success, otherwise error value. * @release_conn: release a previously acquired fcoe connection * @param cdev * @param handle - the connection handle. diff --git a/include/linux/qed/qed_iscsi_if.h b/include/linux/qed/qed_iscsi_if.h index 494cdc3cd840..fbf7973ae9ba 100644 --- a/include/linux/qed/qed_iscsi_if.h +++ b/include/linux/qed/qed_iscsi_if.h @@ -133,7 +133,7 @@ struct qed_iscsi_cb_ops { * @fill_dev_info: fills iSCSI specific information * @param cdev * @param info - * @return 0 on sucesss, otherwise error value. + * @return 0 on success, otherwise error value. * @register_ops: register iscsi operations * @param cdev * @param ops - specified using qed_iscsi_cb_ops @@ -152,7 +152,7 @@ struct qed_iscsi_cb_ops { * connection. * @param p_doorbell - qed will fill the address of the * doorbell. - * @return 0 on sucesss, otherwise error value. + * @return 0 on success, otherwise error value. * @release_conn: release a previously acquired iscsi connection * @param cdev * @param handle - the connection handle. diff --git a/include/linux/qed/qed_nvmetcp_if.h b/include/linux/qed/qed_nvmetcp_if.h index 1d51df347560..bbfbfba51f37 100644 --- a/include/linux/qed/qed_nvmetcp_if.h +++ b/include/linux/qed/qed_nvmetcp_if.h @@ -132,7 +132,7 @@ struct nvmetcp_task_params { * connection. * @param p_doorbell - qed will fill the address of the * doorbell. - * @return 0 on sucesss, otherwise error value. + * @return 0 on success, otherwise error value. * @release_conn: release a previously acquired nvmetcp connection * @param cdev * @param handle - the connection handle. -- cgit v1.2.3 From ad25f5cb39872ca14bcbe00816ae65c22fe04b89 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 21 May 2022 08:45:28 +0100 Subject: rxrpc: Fix locking issue There's a locking issue with the per-netns list of calls in rxrpc. The pieces of code that add and remove a call from the list use write_lock() and the calls procfile uses read_lock() to access it. However, the timer callback function may trigger a removal by trying to queue a call for processing and finding that it's already queued - at which point it has a spare refcount that it has to do something with. Unfortunately, if it puts the call and this reduces the refcount to 0, the call will be removed from the list. Unfortunately, since the _bh variants of the locking functions aren't used, this can deadlock. ================================ WARNING: inconsistent lock state 5.18.0-rc3-build4+ #10 Not tainted -------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. ksoftirqd/2/25 [HC0[0]:SC1[1]:HE1:SE0] takes: ffff888107ac4038 (&rxnet->call_lock){+.?.}-{2:2}, at: rxrpc_put_call+0x103/0x14b {SOFTIRQ-ON-W} state was registered at: ... Possible unsafe locking scenario: CPU0 ---- lock(&rxnet->call_lock); lock(&rxnet->call_lock); *** DEADLOCK *** 1 lock held by ksoftirqd/2/25: #0: ffff8881008ffdb0 ((&call->timer)){+.-.}-{0:0}, at: call_timer_fn+0x5/0x23d Changes ======= ver #2) - Changed to using list_next_rcu() rather than rcu_dereference() directly. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- include/linux/list.h | 10 ++++++++++ include/linux/seq_file.h | 4 ++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index c147eeb2d39d..57e8b559cdf6 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -605,6 +605,16 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) +/** + * list_for_each_rcu - Iterate over a list in an RCU-safe fashion + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each_rcu(pos, head) \ + for (pos = rcu_dereference((head)->next); \ + !list_is_head(pos, (head)); \ + pos = rcu_dereference(pos->next)) + /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 60820ab511d2..bd023dd38ae6 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -277,6 +277,10 @@ extern struct list_head *seq_list_start_head(struct list_head *head, extern struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos); +extern struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos); +extern struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos); +extern struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos); + /* * Helpers for iteration over hlist_head-s in seq_files */ -- cgit v1.2.3 From c304eddcecfe2513ff98ce3ae97d1c196d82ba08 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 May 2022 13:20:54 -0700 Subject: net: wrap the wireless pointers in struct net_device in an ifdef Most protocol-specific pointers in struct net_device are under a respective ifdef. Wireless is the notable exception. Since there's a sizable number of custom-built kernels for datacenter workloads which don't build wireless it seems reasonable to ifdefy those pointers as well. While at it move IPv4 and IPv6 pointers up, those are special for obvious reasons. Acked-by: Johannes Berg Acked-by: Stefan Schmidt # ieee802154 Acked-by: Sven Eckelmann Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1ce91cb8074a..f615a66c89e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2119,6 +2119,8 @@ struct net_device { /* Protocol-specific pointers */ + struct in_device __rcu *ip_ptr; + struct inet6_dev __rcu *ip6_ptr; #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif @@ -2131,16 +2133,18 @@ struct net_device { #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif - struct in_device __rcu *ip_ptr; #if IS_ENABLED(CONFIG_DECNET) struct dn_dev __rcu *dn_ptr; #endif - struct inet6_dev __rcu *ip6_ptr; #if IS_ENABLED(CONFIG_AX25) void *ax25_ptr; #endif +#if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; +#endif +#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN) struct wpan_dev *ieee802154_ptr; +#endif #if IS_ENABLED(CONFIG_MPLS_ROUTING) struct mpls_dev __rcu *mpls_ptr; #endif -- cgit v1.2.3 From fe736565efb775620dbcf3c459c1cd80d3e868da Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 20 May 2022 16:57:53 -0700 Subject: bpf: Introduce bpf_arch_text_invalidate for bpf_prog_pack Introduce bpf_arch_text_invalidate and use it to fill unused part of the bpf_prog_pack with illegal instructions when a BPF program is freed. Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator") Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]") Reported-by: Linus Torvalds Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220520235758.1858153-4-song@kernel.org --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc4d5e394031..a9b1875212f6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2365,6 +2365,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); void *bpf_arch_text_copy(void *dst, void *src, size_t len); +int bpf_arch_text_invalidate(void *dst, size_t len); struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); -- cgit v1.2.3 From 97e03f521050c092919591e668107b3d69c5f426 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:07 -0700 Subject: bpf: Add verifier support for dynptrs This patch adds the bulk of the verifier work for supporting dynamic pointers (dynptrs) in bpf. A bpf_dynptr is opaque to the bpf program. It is a 16-byte structure defined internally as: struct bpf_dynptr_kern { void *data; u32 size; u32 offset; } __aligned(8); The upper 8 bits of *size* is reserved (it contains extra metadata about read-only status and dynptr type). Consequently, a dynptr only supports memory less than 16 MB. There are different types of dynptrs (eg malloc, ringbuf, ...). In this patchset, the most basic one, dynptrs to a bpf program's local memory, is added. For now only local memory that is of reg type PTR_TO_MAP_VALUE is supported. In the verifier, dynptr state information will be tracked in stack slots. When the program passes in an uninitialized dynptr (ARG_PTR_TO_DYNPTR | MEM_UNINIT), the stack slots corresponding to the frame pointer where the dynptr resides at are marked STACK_DYNPTR. For helper functions that take in initialized dynptrs (eg bpf_dynptr_read + bpf_dynptr_write which are added later in this patchset), the verifier enforces that the dynptr has been initialized properly by checking that their corresponding stack slots have been marked as STACK_DYNPTR. The 6th patch in this patchset adds test cases that the verifier should successfully reject, such as for example attempting to use a dynptr after doing a direct write into it inside the bpf program. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20220523210712.3641569-2-joannelkoong@gmail.com --- include/linux/bpf.h | 28 ++++++++++++++++++++++++++++ include/linux/bpf_verifier.h | 18 ++++++++++++++++++ 2 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9b1875212f6..b26c8176b9e0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -392,10 +392,15 @@ enum bpf_type_flag { MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to memory local to the bpf program. */ + DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; +#define DYNPTR_TYPE_FLAG_MASK DYNPTR_TYPE_LOCAL + /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -438,6 +443,7 @@ enum bpf_arg_type { ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ + ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ @@ -2376,4 +2382,26 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 **bin_buf, u32 num_args); void bpf_bprintf_cleanup(void); +/* the implementation of the opaque uapi struct bpf_dynptr */ +struct bpf_dynptr_kern { + void *data; + /* Size represents the number of usable bytes of dynptr data. + * If for example the offset is at 4 for a local dynptr whose data is + * of type u64, the number of usable bytes is 4. + * + * The upper 8 bits are reserved. It is as follows: + * Bits 0 - 23 = size + * Bits 24 - 30 = dynptr type + * Bit 31 = whether dynptr is read-only + */ + u32 size; + u32 offset; +} __aligned(8); + +enum bpf_dynptr_type { + BPF_DYNPTR_TYPE_INVALID, + /* Points to memory that is local to the bpf program */ + BPF_DYNPTR_TYPE_LOCAL, +}; + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1f1e7f2ea967..af5b2135215e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -72,6 +72,18 @@ struct bpf_reg_state { u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ + /* For dynptr stack slots */ + struct { + enum bpf_dynptr_type type; + /* A dynptr is 16 bytes so it takes up 2 stack slots. + * We need to track which slot is the first slot + * to protect against cases where the user may try to + * pass in an address starting at the second slot of the + * dynptr. + */ + bool first_slot; + } dynptr; + /* Max size from any of the above. */ struct { unsigned long raw1; @@ -174,9 +186,15 @@ enum bpf_stack_slot_type { STACK_SPILL, /* register spilled into stack */ STACK_MISC, /* BPF program wrote some data into this slot */ STACK_ZERO, /* BPF program wrote constant zero */ + /* A dynptr is stored in this stack slot. The type of dynptr + * is stored in bpf_stack_state->spilled_ptr.dynptr.type + */ + STACK_DYNPTR, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ +#define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) +#define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) struct bpf_stack_state { struct bpf_reg_state spilled_ptr; -- cgit v1.2.3 From bc34dee65a65e9c920c420005b8a43f2a721a458 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:09 -0700 Subject: bpf: Dynptr support for ring buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, our only way of writing dynamically-sized data into a ring buffer is through bpf_ringbuf_output but this incurs an extra memcpy cost. bpf_ringbuf_reserve + bpf_ringbuf_commit avoids this extra memcpy, but it can only safely support reservation sizes that are statically known since the verifier cannot guarantee that the bpf program won’t access memory outside the reserved space. The bpf_dynptr abstraction allows for dynamically-sized ring buffer reservations without the extra memcpy. There are 3 new APIs: long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr); void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags); void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags); These closely follow the functionalities of the original ringbuf APIs. For example, all ringbuffer dynptrs that have been reserved must be either submitted or discarded before the program exits. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20220523210712.3641569-4-joannelkoong@gmail.com --- include/linux/bpf.h | 15 ++++++++++++++- include/linux/bpf_verifier.h | 2 ++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b26c8176b9e0..c72321b6f306 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -395,11 +395,14 @@ enum bpf_type_flag { /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to a ringbuf record. */ + DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; -#define DYNPTR_TYPE_FLAG_MASK DYNPTR_TYPE_LOCAL +#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -2231,6 +2234,9 @@ extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; +extern const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto; +extern const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto; +extern const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; @@ -2402,6 +2408,13 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, + /* Underlying data is a ringbuf record */ + BPF_DYNPTR_TYPE_RINGBUF, }; +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size); +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); +int bpf_dynptr_check_size(u32 size); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index af5b2135215e..e8439f6cbe57 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -100,6 +100,8 @@ struct bpf_reg_state { * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. + * For stack slots that are dynptrs, this is used to track references to + * the dynptr to determine proper reference freeing. */ u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned -- cgit v1.2.3 From 34d4ef5775f776ec4b0d53a02d588bf3195cada6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:11 -0700 Subject: bpf: Add dynptr data slices This patch adds a new helper function void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len); which returns a pointer to the underlying data of a dynptr. *len* must be a statically known value. The bpf program may access the returned data slice as a normal buffer (eg can do direct reads and writes), since the verifier associates the length with the returned pointer, and enforces that no out of bounds accesses occur. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523210712.3641569-6-joannelkoong@gmail.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c72321b6f306..a7080c86fa76 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,6 +488,7 @@ enum bpf_return_type { RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is -- cgit v1.2.3