From be2644aac3e1db02d09f45d56206bbdafca582a2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Oct 2019 10:49:06 -0700 Subject: tcp: add ipv6_addr_v4mapped_loopback() helper tcp_twsk_unique() has a hard coded assumption about ipv4 loopback being 127/8. Let's instead use the standard ipv4_is_loopback() method, in a new ipv6_addr_v4mapped_loopback() helper. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 009605c56f20..d04b7abe2a4c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -696,6 +696,11 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) cpu_to_be32(0x0000ffff))) == 0UL; } +static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) +{ + return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); +} + static inline u32 ipv6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) -- cgit v1.2.3 From ff92741270bf8b6e78aa885f166b68c7a67ab13a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:15 +0200 Subject: net: introduce name_node struct to be used in hashlist Introduce name_node structure to hold name of device and put it into hashlist instead of putting struct net_device there directly. Add the necessary infrastructure to manipulate the hashlist. This prepares the code to use the same hashlist for alternative names introduced later in this set. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9eda1c31d1f7..e92bc5467256 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -925,6 +925,12 @@ struct dev_ifalias { struct devlink; struct tlsdev_ops; +struct netdev_name_node { + struct hlist_node hlist; + struct net_device *dev; + const char *name; +}; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1564,7 +1570,7 @@ enum netdev_priv_flags { * (i.e. as seen by users in the "Space.c" file). It is the name * of the interface. * - * @name_hlist: Device name hash chain, please keep it close to name[] + * @name_node: Name hashlist node * @ifalias: SNMP alias * @mem_end: Shared memory end * @mem_start: Shared memory start @@ -1774,7 +1780,7 @@ enum netdev_priv_flags { struct net_device { char name[IFNAMSIZ]; - struct hlist_node name_hlist; + struct netdev_name_node *name_node; struct dev_ifalias __rcu *ifalias; /* * I/O specific fields -- cgit v1.2.3 From 36fbf1e52bd3ff8a5cb604955eedfc9350c2e6cc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:16 +0200 Subject: net: rtnetlink: add linkprop commands to add and delete alternative ifnames Add two commands to add and delete list of link properties. Implement the first property type along - alternative ifnames. Each net device can have multiple alternative names. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 ++++ include/uapi/linux/if.h | 1 + include/uapi/linux/if_link.h | 2 ++ include/uapi/linux/rtnetlink.h | 7 +++++++ 4 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e92bc5467256..48cc71aae466 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -927,10 +927,14 @@ struct tlsdev_ops; struct netdev_name_node { struct hlist_node hlist; + struct list_head list; struct net_device *dev; const char *name; }; +int netdev_name_node_alt_create(struct net_device *dev, const char *name); +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 7fea0fd7d6f5..4bf33344aab1 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -33,6 +33,7 @@ #define IFNAMSIZ 16 #endif /* __UAPI_DEF_IF_IFNAMSIZ */ #define IFALIASZ 256 +#define ALTIFNAMSIZ 128 #include /* For glibc compatibility. An empty enum does not compile. 
*/ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4a8c02cafa9a..8aec8769d944 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -167,6 +167,8 @@ enum { IFLA_NEW_IFINDEX, IFLA_MIN_MTU, IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ __IFLA_MAX }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index ce2a623abb75..1418a8362bb7 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -164,6 +164,13 @@ enum { RTM_GETNEXTHOP, #define RTM_GETNEXTHOP RTM_GETNEXTHOP + RTM_NEWLINKPROP = 108, +#define RTM_NEWLINKPROP RTM_NEWLINKPROP + RTM_DELLINKPROP, +#define RTM_DELLINKPROP RTM_DELLINKPROP + RTM_GETLINKPROP, +#define RTM_GETLINKPROP RTM_GETLINKPROP + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; -- cgit v1.2.3 From afa0df5998131153ec3036f41e76ece33bf1334f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 10:15:09 +0200 Subject: net: push loops and nb calls into helper functions Push iterations over net namespaces and netdevices from register_netdevice_notifier() and unregister_netdevice_notifier() into helper functions. Along with that introduce continue_reverse macros to make the code a bit nicer allowing to get rid of "last" marks. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 3 +++ include/net/net_namespace.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 48cc71aae466..7b183f724fc4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2574,6 +2574,9 @@ extern rwlock_t dev_base_lock; /* Device list lock */ list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list) #define for_each_netdev_continue(net, d) \ list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list) +#define for_each_netdev_continue_reverse(net, d) \ + list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \ + dev_list) #define for_each_netdev_continue_rcu(net, d) \ list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_in_bond_rcu(bond, slave) \ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f8712bbeb2e0..c5a98e03591d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -317,7 +317,8 @@ static inline struct net *read_pnet(const possible_net_t *pnet) /* Protected by net_rwsem */ #define for_each_net(VAR) \ list_for_each_entry(VAR, &net_namespace_list, list) - +#define for_each_net_continue_reverse(VAR) \ + list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list) #define for_each_net_rcu(VAR) \ list_for_each_entry_rcu(VAR, &net_namespace_list, list) -- cgit v1.2.3 From a30c7b429f2dd980202c912fcb76442364937b4d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 10:15:10 +0200 Subject: net: introduce per-netns netdevice notifiers Often the code for example in drivers is interested in getting notifier call only from certain network namespace. In addition to the existing global netdevice notifier chain introduce per-netns chains and allow users to register to that. Eventually this would eliminate unnecessary overhead in case there are many netdevices in many network namespaces. 
Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/net/net_namespace.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7b183f724fc4..fe45b2c72315 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2504,6 +2504,9 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd); int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb); struct netdev_notifier_info { struct net_device *dev; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index c5a98e03591d..5ac2bb16d4b3 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -36,6 +36,7 @@ #include #include #include +#include struct user_namespace; struct proc_dir_entry; @@ -96,6 +97,8 @@ struct net { struct list_head dev_base_head; struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; + struct raw_notifier_head netdev_chain; + unsigned int dev_base_seq; /* protected by rtnl_mutex */ int ifindex; unsigned int dev_unreg_count; -- cgit v1.2.3 From 37048e94a2dc81a5a259963117f62341e25161f7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 1 Oct 2019 22:12:50 +0300 Subject: net: dsa: Remove unused __DSA_SKB_CB macro The struct __dsa_skb_cb is supposed to span the entire 48-byte skb control block, while the struct dsa_skb_cb only the portion of it which is used by the DSA core (the rest is available as private data to drivers). The DSA_SKB_CB and __DSA_SKB_CB helpers are supposed to help retrieve this pointer based on a skb, but it turns out there is nobody directly interested in the struct __dsa_skb_cb in the kernel. So remove it. 
Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 541fb514e31d..8c3ea0530f65 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -94,8 +94,6 @@ struct __dsa_skb_cb { u8 priv[48 - sizeof(struct dsa_skb_cb)]; }; -#define __DSA_SKB_CB(skb) ((struct __dsa_skb_cb *)((skb)->cb)) - #define DSA_SKB_CB(skb) ((struct dsa_skb_cb *)((skb)->cb)) #define DSA_SKB_CB_PRIV(skb) \ -- cgit v1.2.3 From 968a2978cb39a754750d35a47049781660682a31 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 2 Oct 2019 16:52:57 +0200 Subject: net: stmmac: Only enable enhanced addressing mode when needed Enhanced addressing mode is only required when more than 32 bits need to be addressed. Add a DMA configuration parameter to enable this mode only when needed. Signed-off-by: Thierry Reding Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index dc60d03c4b60..86f9464c3f5d 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -92,6 +92,7 @@ struct stmmac_dma_cfg { int fixed_burst; int mixed_burst; bool aal; + bool eame; }; #define AXI_BLEN 7 -- cgit v1.2.3 From 4b76f9ed47074990d85c2ec52288a7d09c3ea357 Mon Sep 17 00:00:00 2001 From: Sunil Dutt Date: Fri, 13 Sep 2019 18:11:44 +0530 Subject: nl80211: Document the expectation for NL80211_ATTR_IE in NL80211_CMD_CONNECT This commit documents the expectation for NL80211_ATTR_IE when included in NL80211_CMD_CONNECT, as following. Driver shall not modify the IEs specified through NL80211_ATTR_IE if NL80211_ATTR_MAC is included. However, if NL80211_ATTR_MAC_HINT is included, these IEs through NL80211_ATTR_IE are specified by the user space based on the best possible BSS selected. 
Thus, if the driver ends up selecting a different BSS, it can modify these IEs accordingly (e.g. userspace asks the driver to perform PMKSA caching with BSS1 and the driver ends up selecting BSS2 with different PMKSA cache entry. RSNIE has to get updated with the apt PMKID). Signed-off-by: Sunil Dutt Link: https://lore.kernel.org/r/1568378504-15179-1-git-send-email-usdutt@codeaurora.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index beee59c831a7..64135ab3a7ac 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -571,6 +571,14 @@ * set of BSSID,frequency parameters is used (i.e., either the enforcing * %NL80211_ATTR_MAC,%NL80211_ATTR_WIPHY_FREQ or the less strict * %NL80211_ATTR_MAC_HINT and %NL80211_ATTR_WIPHY_FREQ_HINT). + * Driver shall not modify the IEs specified through %NL80211_ATTR_IE if + * %NL80211_ATTR_MAC is included. However, if %NL80211_ATTR_MAC_HINT is + * included, these IEs through %NL80211_ATTR_IE are specified by the user + * space based on the best possible BSS selected. Thus, if the driver ends + * up selecting a different BSS, it can modify these IEs accordingly (e.g. + * userspace asks the driver to perform PMKSA caching with BSS1 and the + * driver ends up selecting BSS2 with different PMKSA cache entry; RSNIE + * has to get updated with the apt PMKID). * %NL80211_ATTR_PREV_BSSID can be used to request a reassociation within * the ESS in case the device is already associated and an association with * a different BSS is desired. 
-- cgit v1.2.3 From 2ce113de31320756b25179f3f4512a522bc45263 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Oct 2019 11:12:25 +0200 Subject: mac80211: simplify TX aggregation start There really is no need to make drivers call the ieee80211_start_tx_ba_cb_irqsafe() function and then schedule the worker if all we want is to set a bit. Add a new return value (that was previously considered invalid) to indicate that the driver is immediately ready for the session, and make drivers use it. The only drivers that remain different are the Intel ones as they need to negotiate more with the firmware. Link: https://lore.kernel.org/r/1570007543-I152912660131cbab2e5d80b4218238c20f8a06e5@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 523c6a09e1c8..d69081c38788 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3095,7 +3095,9 @@ enum ieee80211_filter_flags { * * @IEEE80211_AMPDU_RX_START: start RX aggregation * @IEEE80211_AMPDU_RX_STOP: stop RX aggregation - * @IEEE80211_AMPDU_TX_START: start TX aggregation + * @IEEE80211_AMPDU_TX_START: start TX aggregation, the driver must either + * call ieee80211_start_tx_ba_cb_irqsafe() or return the special + * status %IEEE80211_AMPDU_TX_START_IMMEDIATE. * @IEEE80211_AMPDU_TX_OPERATIONAL: TX aggregation has become operational * @IEEE80211_AMPDU_TX_STOP_CONT: stop TX aggregation but continue transmitting * queued packets, now unaggregated. After all packets are transmitted the @@ -3119,6 +3121,8 @@ enum ieee80211_ampdu_mlme_action { IEEE80211_AMPDU_TX_OPERATIONAL, }; +#define IEEE80211_AMPDU_TX_START_IMMEDIATE 1 + /** * struct ieee80211_ampdu_params - AMPDU action parameters * @@ -3896,7 +3900,10 @@ struct ieee80211_ops { * * Even ``189`` would be wrong since 1 could be lost again. * - * Returns a negative error code on failure. 
+ * Returns a negative error code on failure. The driver may return + * %IEEE80211_AMPDU_TX_START_IMMEDIATE for %IEEE80211_AMPDU_TX_START + * if the session can start immediately. + * * The callback can sleep. */ int (*ampdu_action)(struct ieee80211_hw *hw, -- cgit v1.2.3 From 7c550daffe22a97282effa75fe7c1f6b83563ecb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:27 +0200 Subject: net: fib_notifier: make FIB notifier per-netns Currently all users of the FIB notifier only care about events in init_net. Later in this patchset, users get interested in other namespaces too. However, for every registered block, the user is interested in only one namespace. Make the FIB notifier registration per-netns and avoid unnecessary calls of notifier block for other namespaces. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 10 ++-------- include/net/fib_notifier.h | 7 +++---- include/net/ip6_fib.h | 2 +- include/net/ip_fib.h | 2 +- 4 files changed, 7 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 34de06b426ef..0931631bbc13 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -47,7 +47,6 @@ struct vif_entry_notifier_info { }; static inline int mr_call_vif_notifier(struct notifier_block *nb, - struct net *net, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, @@ -56,7 +55,6 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb, struct vif_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .dev = vif->dev, .vif_index = vif_index, @@ -64,7 +62,7 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb, .tb_id = tb_id, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_vif_notifiers(struct net *net, @@ -77,7 +75,6 @@ static inline int 
mr_call_vif_notifiers(struct net *net, struct vif_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .dev = vif->dev, .vif_index = vif_index, @@ -173,7 +170,6 @@ struct mfc_entry_notifier_info { }; static inline int mr_call_mfc_notifier(struct notifier_block *nb, - struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id) @@ -181,13 +177,12 @@ static inline int mr_call_mfc_notifier(struct notifier_block *nb, struct mfc_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .mfc = mfc, .tb_id = tb_id }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_mfc_notifiers(struct net *net, @@ -199,7 +194,6 @@ static inline int mr_call_mfc_notifiers(struct net *net, struct mfc_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .mfc = mfc, .tb_id = tb_id diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index c49d7bfb5c30..23353f67b2b0 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -8,7 +8,6 @@ struct module; struct fib_notifier_info { - struct net *net; int family; struct netlink_ext_ack *extack; }; @@ -35,14 +34,14 @@ struct fib_notifier_ops { struct rcu_head rcu; }; -int call_fib_notifier(struct notifier_block *nb, struct net *net, +int call_fib_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); -int register_fib_notifier(struct notifier_block *nb, +int register_fib_notifier(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb)); -int unregister_fib_notifier(struct notifier_block *nb); +int unregister_fib_notifier(struct net *net, struct notifier_block *nb); struct fib_notifier_ops * fib_notifier_ops_register(const struct fib_notifier_ops 
*tmpl, struct net *net); void fib_notifier_ops_unregister(struct fib_notifier_ops *ops); diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 4b5656c71abc..14e9fca0e326 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -478,7 +478,7 @@ struct ipv6_route_iter { extern const struct seq_operations ipv6_route_seq_ops; -int call_fib6_notifier(struct notifier_block *nb, struct net *net, +int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index ab1ca9e238d2..a9df85304f40 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -219,7 +219,7 @@ struct fib_nh_notifier_info { struct fib_nh *fib_nh; }; -int call_fib4_notifier(struct notifier_block *nb, struct net *net, +int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, -- cgit v1.2.3 From 55c894f762a1a99fca80ee55d593083d78e7e4fb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:28 +0200 Subject: net: fib_notifier: propagate possible error during fib notifier registration Unlike events for registered notifier, during the registration, the errors that happened for the block being registered are not propagated up to the caller. Make sure the error is propagated for FIB rules and entries. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/ip_fib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a9df85304f40..05c1fd9c5e23 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -229,7 +229,7 @@ int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_info_notify_update(struct net *net, struct nl_info *info); -void fib_notify(struct net *net, struct notifier_block *nb); +int fib_notify(struct net *net, struct notifier_block *nb); struct fib_table { struct hlist_node tb_hlist; -- cgit v1.2.3 From b7a595577ef3dc9add2b3e6d00869d017306bfbe Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:30 +0200 Subject: net: fib_notifier: propagate extack down to the notifier block callback Since errors are propagated all the way up to the caller, propagate possible extack of the caller all the way down to the notifier block callback. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/mroute_base.h | 18 ++++++++++++------ include/net/fib_notifier.h | 6 ++++-- include/net/fib_rules.h | 3 ++- include/net/ip6_fib.h | 9 ++++++--- include/net/ip_fib.h | 9 ++++++--- 5 files changed, 30 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0931631bbc13..8071148f29a6 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -50,11 +50,13 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, - unsigned short vif_index, u32 tb_id) + unsigned short vif_index, u32 tb_id, + struct netlink_ext_ack *extack) { struct vif_entry_notifier_info info = { .info = { .family = family, + .extack = extack, }, .dev = vif->dev, .vif_index = vif_index, @@ -172,11 +174,13 @@ struct mfc_entry_notifier_info { static inline int mr_call_mfc_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, - struct mr_mfc *mfc, u32 tb_id) + struct mr_mfc *mfc, u32 tb_id, + struct netlink_ext_ack *extack) { struct mfc_entry_notifier_info info = { .info = { .family = family, + .extack = extack, }, .mfc = mfc, .tb_id = tb_id @@ -295,10 +299,11 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock); + rwlock_t *mrt_lock, struct netlink_ext_ack *extack); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, @@ -349,10 +354,11 @@ mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, static inline int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int 
(*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock) + rwlock_t *mrt_lock, struct netlink_ext_ack *extack) { return -EINVAL; } diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 23353f67b2b0..6d59221ff05a 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -29,7 +29,8 @@ struct fib_notifier_ops { int family; struct list_head list; unsigned int (*fib_seq_read)(struct net *net); - int (*fib_dump)(struct net *net, struct notifier_block *nb); + int (*fib_dump)(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); struct module *owner; struct rcu_head rcu; }; @@ -40,7 +41,8 @@ int call_fib_notifier(struct notifier_block *nb, int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int register_fib_notifier(struct net *net, struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)); + void (*cb)(struct notifier_block *nb), + struct netlink_ext_ack *extack); int unregister_fib_notifier(struct net *net, struct notifier_block *nb); struct fib_notifier_ops * fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net); diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 20dcadd8eed9..54e227e6b06a 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -194,7 +194,8 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); bool fib_rule_matchall(const struct fib_rule *rule); -int fib_rules_dump(struct net *net, struct notifier_block *nb, int family); +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, + struct netlink_ext_ack *extack); unsigned int fib_rules_seq_read(struct net *net, int family); int 
fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 14e9fca0e326..5d1615463138 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -488,7 +488,8 @@ int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); -int fib6_tables_dump(struct net *net, struct notifier_block *nb); +int fib6_tables_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); void fib6_update_sernum(struct net *net, struct fib6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); @@ -504,7 +505,8 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) int fib6_rules_init(void); void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); -int fib6_rules_dump(struct net *net, struct notifier_block *nb); +int fib6_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); unsigned int fib6_rules_seq_read(struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, @@ -537,7 +539,8 @@ static inline bool fib6_rule_default(const struct fib_rule *rule) { return true; } -static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb) +static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 05c1fd9c5e23..52b2406a5dfc 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -229,7 +229,8 @@ int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_info_notify_update(struct net *net, struct nl_info *info); -int fib_notify(struct net *net, struct notifier_block *nb); +int fib_notify(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); struct 
fib_table { struct hlist_node tb_hlist; @@ -315,7 +316,8 @@ static inline bool fib4_rule_default(const struct fib_rule *rule) return true; } -static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb) +static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -377,7 +379,8 @@ out: } bool fib4_rule_default(const struct fib_rule *rule); -int fib4_rules_dump(struct net *net, struct notifier_block *nb); +int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); unsigned int fib4_rules_seq_read(struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, -- cgit v1.2.3 From 471f894f106573b0b086d1003ee6172253c67b59 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:31 +0200 Subject: net: devlink: export devlink net getter Allow drivers to get net struct for devlink instance. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 23e4b65ec9df..5ac2be0f0857 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -771,6 +771,7 @@ static inline struct devlink *netdev_to_devlink(struct net_device *dev) struct ib_device; +struct net *devlink_net(const struct devlink *devlink); struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); int devlink_register(struct devlink *devlink, struct device *dev); void devlink_unregister(struct devlink *devlink); -- cgit v1.2.3 From 070c63f20f6c739a3c534555f56c7327536bfcc2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:39 +0200 Subject: net: devlink: allow to change namespaces during reload All devlink instances are created in init_net and stay there for a lifetime. Allow user to be able to move devlink instances into namespaces during devlink reload operation. 
That ensures proper re-instantiation of driver objects, including netdevices. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 2 +- include/uapi/linux/devlink.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 5ac2be0f0857..3c9d4a063c98 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -643,7 +643,7 @@ enum devlink_trap_group_generic_id { } struct devlink_ops { - int (*reload_down)(struct devlink *devlink, + int (*reload_down)(struct devlink *devlink, bool netns_change, struct netlink_ext_ack *extack); int (*reload_up)(struct devlink *devlink, struct netlink_ext_ack *extack); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 580b7a2e40e1..b558ea88b766 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -421,6 +421,10 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_FAILED, /* u8 0 or 1 */ + DEVLINK_ATTR_NETNS_FD, /* u32 */ + DEVLINK_ATTR_NETNS_PID, /* u32 */ + DEVLINK_ATTR_NETNS_ID, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From 25a3cd8189c8832c04225e6f1d41228fd6cc64cc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Oct 2019 11:18:54 -0700 Subject: net/tls: move TOE-related structures to a separate header Move tls_device structure and register/unregister functions to a new header to avoid confusion with normal, non-TOE offload. Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/tls.h | 34 ------------------------ include/net/tls_toe.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 34 deletions(-) create mode 100644 include/net/tls_toe.h (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index c664e6dba0d1..57865c944095 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -60,7 +60,6 @@ #define TLS_RECORD_TYPE_DATA 0x17 #define TLS_AAD_SPACE_SIZE 13 -#define TLS_DEVICE_NAME_MAX 32 #define MAX_IV_SIZE 16 #define TLS_MAX_REC_SEQ_SIZE 8 @@ -74,37 +73,6 @@ */ #define TLS_AES_CCM_IV_B0_BYTE 2 -/* - * This structure defines the routines for Inline TLS driver. - * The following routines are optional and filled with a - * null pointer if not defined. - * - * @name: Its the name of registered Inline tls device - * @dev_list: Inline tls device list - * int (*feature)(struct tls_device *device); - * Called to return Inline TLS driver capability - * - * int (*hash)(struct tls_device *device, struct sock *sk); - * This function sets Inline driver for listen and program - * device specific functioanlity as required - * - * void (*unhash)(struct tls_device *device, struct sock *sk); - * This function cleans listen state set by Inline TLS driver - * - * void (*release)(struct kref *kref); - * Release the registered device and allocated resources - * @kref: Number of reference to tls_device - */ -struct tls_device { - char name[TLS_DEVICE_NAME_MAX]; - struct list_head dev_list; - int (*feature)(struct tls_device *device); - int (*hash)(struct tls_device *device, struct sock *sk); - void (*unhash)(struct tls_device *device, struct sock *sk); - void (*release)(struct kref *kref); - struct kref kref; -}; - enum { TLS_BASE, TLS_SW, @@ -643,8 +611,6 @@ static inline bool tls_offload_tx_resync_pending(struct sock *sk) int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); -void tls_register_device(struct tls_device *device); -void 
tls_unregister_device(struct tls_device *device); int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout); struct sk_buff *tls_encrypt_skb(struct sk_buff *skb); diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h new file mode 100644 index 000000000000..81b66c76b31f --- /dev/null +++ b/include/net/tls_toe.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson . All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +struct sock; + +#define TLS_DEVICE_NAME_MAX 32 + +/* + * This structure defines the routines for Inline TLS driver. 
+ * The following routines are optional and filled with a + * null pointer if not defined. + * + * @name: Its the name of registered Inline tls device + * @dev_list: Inline tls device list + * int (*feature)(struct tls_device *device); + * Called to return Inline TLS driver capability + * + * int (*hash)(struct tls_device *device, struct sock *sk); + * This function sets Inline driver for listen and program + * device specific functioanlity as required + * + * void (*unhash)(struct tls_device *device, struct sock *sk); + * This function cleans listen state set by Inline TLS driver + * + * void (*release)(struct kref *kref); + * Release the registered device and allocated resources + * @kref: Number of reference to tls_device + */ +struct tls_device { + char name[TLS_DEVICE_NAME_MAX]; + struct list_head dev_list; + int (*feature)(struct tls_device *device); + int (*hash)(struct tls_device *device, struct sock *sk); + void (*unhash)(struct tls_device *device, struct sock *sk); + void (*release)(struct kref *kref); + struct kref kref; +}; + +void tls_register_device(struct tls_device *device); +void tls_unregister_device(struct tls_device *device); -- cgit v1.2.3 From f21912edd1570818cbcb16bd1da7d7a2b122d66b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Oct 2019 11:18:55 -0700 Subject: net/tls: rename tls_device to tls_toe_device Rename struct tls_device to struct tls_toe_device to avoid confusion with normal, non-TOE offload. No functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/tls_toe.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h index 81b66c76b31f..b56d30a5bd6d 100644 --- a/include/net/tls_toe.h +++ b/include/net/tls_toe.h @@ -36,7 +36,7 @@ struct sock; -#define TLS_DEVICE_NAME_MAX 32 +#define TLS_TOE_DEVICE_NAME_MAX 32 /* * This structure defines the routines for Inline TLS driver. @@ -45,29 +45,29 @@ struct sock; * * @name: Its the name of registered Inline tls device * @dev_list: Inline tls device list - * int (*feature)(struct tls_device *device); + * int (*feature)(struct tls_toe_device *device); * Called to return Inline TLS driver capability * - * int (*hash)(struct tls_device *device, struct sock *sk); + * int (*hash)(struct tls_toe_device *device, struct sock *sk); * This function sets Inline driver for listen and program * device specific functioanlity as required * - * void (*unhash)(struct tls_device *device, struct sock *sk); + * void (*unhash)(struct tls_toe_device *device, struct sock *sk); * This function cleans listen state set by Inline TLS driver * * void (*release)(struct kref *kref); * Release the registered device and allocated resources - * @kref: Number of reference to tls_device + * @kref: Number of reference to tls_toe_device */ -struct tls_device { - char name[TLS_DEVICE_NAME_MAX]; +struct tls_toe_device { + char name[TLS_TOE_DEVICE_NAME_MAX]; struct list_head dev_list; - int (*feature)(struct tls_device *device); - int (*hash)(struct tls_device *device, struct sock *sk); - void (*unhash)(struct tls_device *device, struct sock *sk); + int (*feature)(struct tls_toe_device *device); + int (*hash)(struct tls_toe_device *device, struct sock *sk); + void (*unhash)(struct tls_toe_device *device, struct sock *sk); void (*release)(struct kref *kref); struct kref kref; }; -void tls_register_device(struct tls_device *device); -void tls_unregister_device(struct tls_device *device); 
+void tls_toe_register_device(struct tls_toe_device *device); +void tls_toe_unregister_device(struct tls_toe_device *device); -- cgit v1.2.3 From 08700dab816847d5e600ef263155fb04ea4b312d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Oct 2019 11:18:57 -0700 Subject: net/tls: move TOE-related code to a separate file Move tls_hw_* functions to a new, separate source file to avoid confusion with normal, non-TOE offload. Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 3 +++ include/net/tls_toe.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 57865c944095..5c48cb9e0c18 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -308,7 +308,10 @@ struct tls_offload_context_rx { #define TLS_OFFLOAD_CONTEXT_SIZE_RX \ (sizeof(struct tls_offload_context_rx) + TLS_DRIVER_STATE_SIZE_RX) +struct tls_context *tls_ctx_create(struct sock *sk); void tls_ctx_free(struct sock *sk, struct tls_context *ctx); +void update_sk_prot(struct sock *sk, struct tls_context *ctx); + int wait_on_pending_writer(struct sock *sk, long *timeo); int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h index b56d30a5bd6d..3bb39c795aed 100644 --- a/include/net/tls_toe.h +++ b/include/net/tls_toe.h @@ -69,5 +69,9 @@ struct tls_toe_device { struct kref kref; }; +int tls_hw_prot(struct sock *sk); +int tls_hw_hash(struct sock *sk); +void tls_hw_unhash(struct sock *sk); + void tls_toe_register_device(struct tls_toe_device *device); void tls_toe_unregister_device(struct tls_toe_device *device); -- cgit v1.2.3 From 0eb8745e03c9ed2a7412c7a844ebc4f0e4f80de4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Oct 2019 11:18:58 -0700 Subject: net/tls: rename tls_hw_* functions tls_toe_* The tls_hw_* functions are quite confusingly named, since they 
are related to the TOE-offload, not TLS_HW offload which doesn't require TOE. Rename them. Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls_toe.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h index 3bb39c795aed..b3aa7593ce2c 100644 --- a/include/net/tls_toe.h +++ b/include/net/tls_toe.h @@ -69,9 +69,9 @@ struct tls_toe_device { struct kref kref; }; -int tls_hw_prot(struct sock *sk); -int tls_hw_hash(struct sock *sk); -void tls_hw_unhash(struct sock *sk); +int tls_toe_bypass(struct sock *sk); +int tls_toe_hash(struct sock *sk); +void tls_toe_unhash(struct sock *sk); void tls_toe_register_device(struct tls_toe_device *device); void tls_toe_unregister_device(struct tls_toe_device *device); -- cgit v1.2.3 From d6547f2a2cfc8b145b59291d3e4b072891f34882 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Oct 2019 23:29:24 +0300 Subject: net, uapi: fix -Wpointer-arith warnings Add casts to fix these warnings: ./usr/include/linux/netfilter_arp/arp_tables.h:200:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_bridge/ebtables.h:197:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_ipv4/ip_tables.h:223:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_ipv6/ip6_tables.h:263:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/tipc_config.h:310:28: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/tipc_config.h:410:24: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/virtio_ring.h:170:16: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] Those are theoretical 
probably but kernel doesn't control compiler flags in userspace. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/uapi/linux/netfilter_arp/arp_tables.h | 2 +- include/uapi/linux/netfilter_bridge/ebtables.h | 2 +- include/uapi/linux/netfilter_ipv4/ip_tables.h | 2 +- include/uapi/linux/netfilter_ipv6/ip6_tables.h | 2 +- include/uapi/linux/tipc_config.h | 4 ++-- include/uapi/linux/virtio_ring.h | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter_arp/arp_tables.h b/include/uapi/linux/netfilter_arp/arp_tables.h index a2a0927d9bd6..bbf5af2b67a8 100644 --- a/include/uapi/linux/netfilter_arp/arp_tables.h +++ b/include/uapi/linux/netfilter_arp/arp_tables.h @@ -199,7 +199,7 @@ struct arpt_get_entries { /* Helper functions */ static __inline__ struct xt_entry_target *arpt_get_target(struct arpt_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + e->target_offset); } /* diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 8076c940ffeb..a494cf43a755 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -194,7 +194,7 @@ struct ebt_entry { static __inline__ struct ebt_entry_target * ebt_get_target(struct ebt_entry *e) { - return (void *)e + e->target_offset; + return (struct ebt_entry_target *)((char *)e + e->target_offset); } /* {g,s}etsockopt numbers */ diff --git a/include/uapi/linux/netfilter_ipv4/ip_tables.h b/include/uapi/linux/netfilter_ipv4/ip_tables.h index 6aaeb14bfce1..50c7fee625ae 100644 --- a/include/uapi/linux/netfilter_ipv4/ip_tables.h +++ b/include/uapi/linux/netfilter_ipv4/ip_tables.h @@ -222,7 +222,7 @@ struct ipt_get_entries { static __inline__ struct xt_entry_target * ipt_get_target(struct ipt_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + 
e->target_offset); } /* diff --git a/include/uapi/linux/netfilter_ipv6/ip6_tables.h b/include/uapi/linux/netfilter_ipv6/ip6_tables.h index 031d0a43bed2..d9e364f96a5c 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6_tables.h +++ b/include/uapi/linux/netfilter_ipv6/ip6_tables.h @@ -262,7 +262,7 @@ struct ip6t_get_entries { static __inline__ struct xt_entry_target * ip6t_get_target(struct ip6t_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + e->target_offset); } /* diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h index 4955e1a9f1bc..4dfc05651c98 100644 --- a/include/uapi/linux/tipc_config.h +++ b/include/uapi/linux/tipc_config.h @@ -309,7 +309,7 @@ static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len) tlv_ptr->tlv_len = htons(tlv_len); if (len && data) { memcpy(TLV_DATA(tlv_ptr), data, len); - memset(TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); + memset((char *)TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); } return TLV_SPACE(len); } @@ -409,7 +409,7 @@ static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags, tcm_hdr->tcm_flags = htons(flags); if (data_len && data) { memcpy(TCM_DATA(msg), data, data_len); - memset(TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); + memset((char *)TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); } return TCM_SPACE(data_len); } diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index 4c4e24c291a5..559f42e73315 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -169,7 +169,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, { vr->num = num; vr->desc = p; - vr->avail = p + num*sizeof(struct vring_desc); + vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc)); vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16) + align-1) & ~(align - 1)); } -- cgit 
v1.2.3 From 193d357d087309f2d5ab8e8caab1af5e3bc29fa0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Oct 2019 23:56:37 +0300 Subject: net: spread "enum sock_flags" Some ints are "enum sock_flags" in fact. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 2c53f1a1d905..ab905c4b1f0e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2512,7 +2512,7 @@ static inline bool sk_listener(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } -void sock_enable_timestamp(struct sock *sk, int flag); +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); -- cgit v1.2.3 From 8538d29cea9530f114159e06bfa31b2358161493 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:22 -0700 Subject: net/tls: add tracing for device/offload events Add tracing of device-related interaction to aid performance analysis, especially around resync: tls:tls_device_offload_set tls:tls_device_rx_resync_send tls:tls_device_rx_resync_nh_schedule tls:tls_device_rx_resync_nh_delay tls:tls_device_tx_resync_req tls:tls_device_tx_resync_send Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/net/tls.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 5c48cb9e0c18..38086ade65ce 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -594,13 +594,6 @@ tls_offload_rx_resync_set_type(struct sock *sk, enum tls_offload_sync_type type) tls_offload_ctx_rx(tls_ctx)->resync_type = type; } -static inline void tls_offload_tx_resync_request(struct sock *sk) -{ - struct tls_context *tls_ctx = tls_get_ctx(sk); - - WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags)); -} - /* Driver's seq tracking has to be disabled until resync succeeded */ static inline bool tls_offload_tx_resync_pending(struct sock *sk) { @@ -634,6 +627,7 @@ void tls_device_free_resources_tx(struct sock *sk); int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); void tls_device_offload_cleanup_rx(struct sock *sk); void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq); +void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq); int tls_device_decrypted(struct sock *sk, struct sk_buff *skb); #else static inline void tls_device_init(void) {} -- cgit v1.2.3 From d26b698dd3cd52f5a3277446a87e5e0198c99cd0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:24 -0700 Subject: net/tls: add skeleton of MIB statistics Add a skeleton structure for adding TLS statistics. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/net/netns/mib.h | 3 +++ include/net/snmp.h | 6 ++++++ include/net/tls.h | 13 +++++++++++++ include/uapi/linux/snmp.h | 7 +++++++ 4 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h index 830bdf345b17..b5fdb108d602 100644 --- a/include/net/netns/mib.h +++ b/include/net/netns/mib.h @@ -24,6 +24,9 @@ struct netns_mib { #ifdef CONFIG_XFRM_STATISTICS DEFINE_SNMP_STAT(struct linux_xfrm_mib, xfrm_statistics); #endif +#if IS_ENABLED(CONFIG_TLS) + DEFINE_SNMP_STAT(struct linux_tls_mib, tls_statistics); +#endif }; #endif diff --git a/include/net/snmp.h b/include/net/snmp.h index cb8ced4380a6..468a67836e2f 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -111,6 +111,12 @@ struct linux_xfrm_mib { unsigned long mibs[LINUX_MIB_XFRMMAX]; }; +/* Linux TLS */ +#define LINUX_MIB_TLSMAX __LINUX_MIB_TLSMAX +struct linux_tls_mib { + unsigned long mibs[LINUX_MIB_TLSMAX]; +}; + #define DEFINE_SNMP_STAT(type, name) \ __typeof__(type) __percpu *name #define DEFINE_SNMP_STAT_ATOMIC(type, name) \ diff --git a/include/net/tls.h b/include/net/tls.h index 38086ade65ce..24c37bffc961 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -73,6 +74,15 @@ */ #define TLS_AES_CCM_IV_B0_BYTE 2 +#define __TLS_INC_STATS(net, field) \ + __SNMP_INC_STATS((net)->mib.tls_statistics, field) +#define TLS_INC_STATS(net, field) \ + SNMP_INC_STATS((net)->mib.tls_statistics, field) +#define __TLS_DEC_STATS(net, field) \ + __SNMP_DEC_STATS((net)->mib.tls_statistics, field) +#define TLS_DEC_STATS(net, field) \ + SNMP_DEC_STATS((net)->mib.tls_statistics, field) + enum { TLS_BASE, TLS_SW, @@ -605,6 +615,9 @@ static inline bool tls_offload_tx_resync_pending(struct sock *sk) return ret; } +int __net_init tls_proc_init(struct net *net); +void __net_exit tls_proc_fini(struct net *net); + int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, 
unsigned char *record_type); int decrypt_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 549a31c29f7d..4abd57948ad4 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -323,4 +323,11 @@ enum __LINUX_MIB_XFRMMAX }; +/* linux TLS mib definitions */ +enum +{ + LINUX_MIB_TLSNUM = 0, + __LINUX_MIB_TLSMAX +}; + #endif /* _LINUX_SNMP_H */ -- cgit v1.2.3 From b32fd3cc31d723bf2ab859667be3612c0086ec72 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:25 -0700 Subject: net/tls: add statistics for installed sessions Add SNMP stats for number of sockets with successfully installed sessions. Break them down to software and hardware ones. Note that if hardware offload fails stack uses software implementation, and counts the session appropriately. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 4abd57948ad4..1b4613b5af70 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -327,6 +327,14 @@ enum enum { LINUX_MIB_TLSNUM = 0, + LINUX_MIB_TLSCURRTXSW, /* TlsCurrTxSw */ + LINUX_MIB_TLSCURRRXSW, /* TlsCurrRxSw */ + LINUX_MIB_TLSCURRTXDEVICE, /* TlsCurrTxDevice */ + LINUX_MIB_TLSCURRRXDEVICE, /* TlsCurrRxDevice */ + LINUX_MIB_TLSTXSW, /* TlsTxSw */ + LINUX_MIB_TLSRXSW, /* TlsRxSw */ + LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ + LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From 5c5ec66858062a857cf51f57cbe52b36330f7ae6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:26 -0700 Subject: net/tls: add TlsDecryptError stat Add a statistic for TLS record decryption errors. 
Since devices are supposed to pass records as-is when they encounter errors this statistic will count bad records in both pure software and inline crypto configurations. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 1b4613b5af70..c9e4963e26f0 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -335,6 +335,7 @@ enum LINUX_MIB_TLSRXSW, /* TlsRxSw */ LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ + LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From a4d26fdbc2a5414bb1b67198656cc7e24a4a3c3a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:27 -0700 Subject: net/tls: add TlsDeviceRxResync statistic Add a statistic for number of RX resyncs sent down to the NIC. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index c9e4963e26f0..7eee233e78d2 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -336,6 +336,7 @@ enum LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ + LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From 8273fd845447820c26b38821c8ac297f40a65260 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 5 Oct 2019 08:10:31 +0200 Subject: net: devlink: export devlink net setter For newly allocated devlink instance allow drivers to set net struct Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 3c9d4a063c98..4095657fc23f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -39,6 +39,7 @@ struct devlink { possible_net_t _net; struct mutex lock; bool reload_failed; + bool registered; char priv[0] __aligned(NETDEV_ALIGN); }; @@ -772,6 +773,7 @@ static inline struct devlink *netdev_to_devlink(struct net_device *dev) struct ib_device; struct net *devlink_net(const struct devlink *devlink); +void devlink_net_set(struct devlink *devlink, struct net *net); struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); int devlink_register(struct devlink *devlink, struct device *dev); void devlink_unregister(struct devlink *devlink); -- cgit v1.2.3 From 1927f41a22a05e3bc178fa47f7ce7be271fbc541 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 5 Oct 2019 20:04:34 +0200 Subject: net: genetlink: introduce dump info struct to be available during dumpit op Currently the cb->data is taken by ops during non-parallel dumping. Introduce a new structure genl_dumpit_info and store the ops there. Distribute the info to both non-parallel and parallel dumping. Also add a helper genl_dumpit_info() to easily get the info structure in the dumpit callback from cb. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/genetlink.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9292f1c588b7..fb838f4b0089 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -127,6 +127,20 @@ enum genl_validate_flags { GENL_DONT_VALIDATE_DUMP_STRICT = BIT(2), }; +/** + * struct genl_info - info that is available during dumpit op call + * @ops: generic netlink ops - for internal genl code usage + */ +struct genl_dumpit_info { + const struct genl_ops *ops; +}; + +static inline const struct genl_dumpit_info * +genl_dumpit_info(struct netlink_callback *cb) +{ + return cb->data; +} + /** * struct genl_ops - generic netlink operations * @cmd: command identifier -- cgit v1.2.3 From bf813b0afeae2f012f0e527a526c1b78ca21ad82 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 5 Oct 2019 20:04:36 +0200 Subject: net: genetlink: parse attrs and store in context info struct during dumpit Extend the dumpit info struct for attrs. Instead of the existing attribute validation, parse the attributes and save them in the info struct. The caller can benefit from this and does not have to do the parsing itself. In order to properly free attrs, genl_family pointer needs to be added to dumpit info struct as well. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/genetlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index fb838f4b0089..922dcc9348b1 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -129,10 +129,14 @@ enum genl_validate_flags { /** * struct genl_info - info that is available during dumpit op call + * @family: generic netlink family - for internal genl code usage * @ops: generic netlink ops - for internal genl code usage + * @attrs: netlink attributes */ struct genl_dumpit_info { + const struct genl_family *family; const struct genl_ops *ops; + struct nlattr **attrs; }; static inline const struct genl_dumpit_info * -- cgit v1.2.3 From 265ecd4fa3f0ca43909f8b2cc0e519966f21b167 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 5 Oct 2019 20:04:41 +0200 Subject: net: genetlink: remove unused genl_family_attrbuf() genl_family_attrbuf() function is no longer used by anyone, so remove it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/genetlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 922dcc9348b1..74950663bb00 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -75,8 +75,6 @@ struct genl_family { struct module *module; }; -struct nlattr **genl_family_attrbuf(const struct genl_family *family); - /** * struct genl_info - receiving information * @snd_seq: sending sequence number -- cgit v1.2.3 From 5f0e5412781b01708f622d00c0b3f77b9dca7367 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 6 Oct 2019 20:07:36 -0700 Subject: uapi/bpf: fix helper docs Various small fixes to BPF helper documentation comments, enabling automatic header generation with a list of BPF helpers. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77c6be96d676..a65c3b0c6935 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -794,7 +794,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(char *buf, u32 size_of_buf) + * int bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -1023,7 +1023,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1068,7 +1068,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) + * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1085,7 +1085,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags) + * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. 
To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1154,7 +1154,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1172,7 +1172,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1511,7 +1511,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,7 +1595,7 @@ union bpf_attr { * Return * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. * - * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1715,7 +1715,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1947,7 +1947,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) + * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -1980,7 +1980,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) + * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2033,7 +2033,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2392,7 +2392,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) + * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2408,9 +2408,9 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags) + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description - * Will remove *pop* bytes from a *msg* starting at byte *start*. + * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if * an allocation and copy are required due to a full ring buffer. * However, the helper will try to avoid doing the allocation @@ -2505,7 +2505,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * int bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** -- cgit v1.2.3 From b9df4fd7e99cb8bfd80c4143f3045d63b1754ad0 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 6 Oct 2019 18:19:54 +0200 Subject: net: core: change return type of pskb_may_pull to bool This function de-facto returns a bool, so let's change the return type accordingly. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4351577b14d7..0a58402a166e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2261,12 +2261,12 @@ static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) return unlikely(len > skb->len) ? 
NULL : __pskb_pull(skb, len); } -static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len) +static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) { if (likely(len <= skb_headlen(skb))) - return 1; + return true; if (unlikely(len > skb->len)) - return 0; + return false; return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL; } -- cgit v1.2.3 From 328908621081c3c7455c39549c5334e74b7c525a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 7 Oct 2019 09:37:27 -0400 Subject: ipv6: Make ipv6_mc_may_pull() return bool. Consistent with how pskb_may_pull() also now does so. Signed-off-by: David S. Miller --- include/net/addrconf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 3f62b347b04a..1bab88184d3c 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -202,11 +202,11 @@ u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, /* * multicast prototypes (mcast.c) */ -static inline int ipv6_mc_may_pull(struct sk_buff *skb, - unsigned int len) +static inline bool ipv6_mc_may_pull(struct sk_buff *skb, + unsigned int len) { if (skb_transport_offset(skb) + ipv6_transport_len(skb) < len) - return 0; + return false; return pskb_may_pull(skb, len); } -- cgit v1.2.3 From 163ab96b52ae2bb2d8f188cd29f0b570610f9007 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 6 Oct 2019 21:09:27 -0700 Subject: net: sockmap: use bitmap for copy info Don't use bool array in struct sk_msg_sg, save 12 bytes. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. 
Miller --- include/linux/skmsg.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e4b3fb4bb77c..fe80d537945d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -28,13 +28,14 @@ struct sk_msg_sg { u32 end; u32 size; u32 copybreak; - bool copy[MAX_MSG_FRAGS]; + unsigned long copy; /* The extra element is used for chaining the front and sections when * the list becomes partitioned (e.g. end < start). The crypto APIs * require the chaining. */ struct scatterlist data[MAX_MSG_FRAGS + 1]; }; +static_assert(BITS_PER_LONG >= MAX_MSG_FRAGS); /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { @@ -227,7 +228,7 @@ static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); - if (msg->sg.copy[msg->sg.start]) { + if (test_bit(msg->sg.start, &msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { @@ -246,7 +247,7 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, sg_set_page(sge, page, len, offset); sg_unmark_end(sge); - msg->sg.copy[msg->sg.end] = true; + __set_bit(msg->sg.end, &msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } @@ -254,7 +255,10 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { - msg->sg.copy[i] = copy_state; + if (copy_state) + __set_bit(i, &msg->sg.copy); + else + __clear_bit(i, &msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; -- cgit v1.2.3 From 4de30a8d58c90e18140342cdcb74903d2e4fbb62 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 6 Oct 2019 21:09:30 -0700 Subject: net/tls: pass context to tls_device_decrypted() Avoid unnecessary pointer chasing and calculations, callers already have most of the state tls_device_decrypted() needs. 
Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- include/net/tls.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 24c37bffc961..b809f2362049 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -641,7 +641,8 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); void tls_device_offload_cleanup_rx(struct sock *sk); void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq); void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq); -int tls_device_decrypted(struct sock *sk, struct sk_buff *skb); +int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, + struct sk_buff *skb, struct strp_msg *rxm); #else static inline void tls_device_init(void) {} static inline void tls_device_cleanup(void) {} @@ -664,7 +665,9 @@ static inline void tls_device_offload_cleanup_rx(struct sock *sk) {} static inline void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {} -static inline int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) +static inline int +tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, + struct sk_buff *skb, struct strp_msg *rxm) { return 0; } -- cgit v1.2.3 From 5c5458ec9d631fbca29f53a944168265e18aa77a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 6 Oct 2019 21:09:31 -0700 Subject: net/tls: store async_capable on a single bit Store async_capable on a single bit instead of a full integer to save space. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/tls.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index b809f2362049..97eae7271a67 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -136,7 +136,7 @@ struct tls_sw_context_tx { struct list_head tx_list; atomic_t encrypt_pending; int async_notify; - int async_capable; + u8 async_capable:1; #define BIT_TX_SCHEDULED 0 #define BIT_TX_CLOSING 1 @@ -152,7 +152,7 @@ struct tls_sw_context_rx { struct sk_buff *recv_pkt; u8 control; - int async_capable; + u8 async_capable:1; bool decrypted; atomic_t decrypt_pending; bool async_notify; -- cgit v1.2.3 From bc76e5bb1229ede1f26317b813099b0e983e4009 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 6 Oct 2019 21:09:32 -0700 Subject: net/tls: store decrypted on a single bit Use a single bit instead of boolean to remember if packet was already decrypted. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 97eae7271a67..41265e542e71 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -153,7 +153,7 @@ struct tls_sw_context_rx { struct sk_buff *recv_pkt; u8 control; u8 async_capable:1; - bool decrypted; + u8 decrypted:1; atomic_t decrypt_pending; bool async_notify; }; -- cgit v1.2.3 From 017f77c050a3bc1f1ff877d1f265beeee26d7dea Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:01 +0100 Subject: netfilter: ipset: add a coding-style fix to ip_set_ext_destroy. Use a local variable to hold comment in order to align the arguments of ip_set_comment_free properly. 
Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 9bc255a8461b..9fee4837d02c 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -269,9 +269,11 @@ ip_set_ext_destroy(struct ip_set *set, void *data) /* Check that the extension is enabled for the set and * call it's destroy function for its extension part in data. */ - if (SET_WITH_COMMENT(set)) - ip_set_extensions[IPSET_EXT_ID_COMMENT].destroy( - set, ext_comment(data, set)); + if (SET_WITH_COMMENT(set)) { + struct ip_set_comment *c = ext_comment(data, set); + + ip_set_extensions[IPSET_EXT_ID_COMMENT].destroy(set, c); + } } static inline int -- cgit v1.2.3 From 94177f6e11c74b6ca3bcf7f65d3d74f00bbd6a8c Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:03 +0100 Subject: netfilter: ipset: move ip_set_comment functions from ip_set.h to ip_set_core.c. Most of the functions are only called from within ip_set_core.c. The exception is ip_set_init_comment. However, this is too complex to be a good candidate for a static inline function. Move it to ip_set_core.c, change its linkage to extern and export it, leaving a declaration in ip_set.h. ip_set_comment_free is only used as an extension destructor, so change its prototype to match and drop cast. 
Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 63 ++-------------------------------- 1 file changed, 2 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 9fee4837d02c..985c9bb1ab65 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -521,67 +521,8 @@ ip_set_timeout_get(const unsigned long *timeout) return t == 0 ? 1 : t; } -static inline char* -ip_set_comment_uget(struct nlattr *tb) -{ - return nla_data(tb); -} - -/* Called from uadd only, protected by the set spinlock. - * The kadt functions don't use the comment extensions in any way. - */ -static inline void -ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, - const struct ip_set_ext *ext) -{ - struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); - size_t len = ext->comment ? strlen(ext->comment) : 0; - - if (unlikely(c)) { - set->ext_size -= sizeof(*c) + strlen(c->str) + 1; - kfree_rcu(c, rcu); - rcu_assign_pointer(comment->c, NULL); - } - if (!len) - return; - if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) - len = IPSET_MAX_COMMENT_SIZE; - c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); - if (unlikely(!c)) - return; - strlcpy(c->str, ext->comment, len + 1); - set->ext_size += sizeof(*c) + strlen(c->str) + 1; - rcu_assign_pointer(comment->c, c); -} - -/* Used only when dumping a set, protected by rcu_read_lock() */ -static inline int -ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) -{ - struct ip_set_comment_rcu *c = rcu_dereference(comment->c); - - if (!c) - return 0; - return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); -} - -/* Called from uadd/udel, flush or the garbage collectors protected - * by the set spinlock. 
- * Called when the set is destroyed and when there can't be any user - * of the set data anymore. - */ -static inline void -ip_set_comment_free(struct ip_set *set, struct ip_set_comment *comment) -{ - struct ip_set_comment_rcu *c; - - c = rcu_dereference_protected(comment->c, 1); - if (unlikely(!c)) - return; - set->ext_size -= sizeof(*c) + strlen(c->str) + 1; - kfree_rcu(c, rcu); - rcu_assign_pointer(comment->c, NULL); -} +void ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, + const struct ip_set_ext *ext); static inline void ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) -- cgit v1.2.3 From 2398a97688f1aaca09d0a5a809f361e2abf5ff3c Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:04 +0100 Subject: netfilter: ipset: move functions to ip_set_core.c. Several inline functions in ip_set.h are only called in ip_set_core.c: move them and remove inline function specifier. Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 102 --------------------------------- 1 file changed, 102 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 985c9bb1ab65..44f6de8a1733 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -508,86 +508,9 @@ ip_set_timeout_set(unsigned long *timeout, u32 value) *timeout = t; } -static inline u32 -ip_set_timeout_get(const unsigned long *timeout) -{ - u32 t; - - if (*timeout == IPSET_ELEM_PERMANENT) - return 0; - - t = jiffies_to_msecs(*timeout - jiffies)/MSEC_PER_SEC; - /* Zero value in userspace means no timeout */ - return t == 0 ? 
1 : t; -} - void ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, const struct ip_set_ext *ext); -static inline void -ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) -{ - atomic64_add((long long)bytes, &(counter)->bytes); -} - -static inline void -ip_set_add_packets(u64 packets, struct ip_set_counter *counter) -{ - atomic64_add((long long)packets, &(counter)->packets); -} - -static inline u64 -ip_set_get_bytes(const struct ip_set_counter *counter) -{ - return (u64)atomic64_read(&(counter)->bytes); -} - -static inline u64 -ip_set_get_packets(const struct ip_set_counter *counter) -{ - return (u64)atomic64_read(&(counter)->packets); -} - -static inline bool -ip_set_match_counter(u64 counter, u64 match, u8 op) -{ - switch (op) { - case IPSET_COUNTER_NONE: - return true; - case IPSET_COUNTER_EQ: - return counter == match; - case IPSET_COUNTER_NE: - return counter != match; - case IPSET_COUNTER_LT: - return counter < match; - case IPSET_COUNTER_GT: - return counter > match; - } - return false; -} - -static inline void -ip_set_update_counter(struct ip_set_counter *counter, - const struct ip_set_ext *ext, u32 flags) -{ - if (ext->packets != ULLONG_MAX && - !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) { - ip_set_add_bytes(ext->bytes, counter); - ip_set_add_packets(ext->packets, counter); - } -} - -static inline bool -ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter) -{ - return nla_put_net64(skb, IPSET_ATTR_BYTES, - cpu_to_be64(ip_set_get_bytes(counter)), - IPSET_ATTR_PAD) || - nla_put_net64(skb, IPSET_ATTR_PACKETS, - cpu_to_be64(ip_set_get_packets(counter)), - IPSET_ATTR_PAD); -} - static inline void ip_set_init_counter(struct ip_set_counter *counter, const struct ip_set_ext *ext) @@ -598,31 +521,6 @@ ip_set_init_counter(struct ip_set_counter *counter, atomic64_set(&(counter)->packets, (long long)(ext->packets)); } -static inline void -ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, - const struct ip_set_ext 
*ext, - struct ip_set_ext *mext, u32 flags) -{ - mext->skbinfo = *skbinfo; -} - -static inline bool -ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo) -{ - /* Send nonzero parameters only */ - return ((skbinfo->skbmark || skbinfo->skbmarkmask) && - nla_put_net64(skb, IPSET_ATTR_SKBMARK, - cpu_to_be64((u64)skbinfo->skbmark << 32 | - skbinfo->skbmarkmask), - IPSET_ATTR_PAD)) || - (skbinfo->skbprio && - nla_put_net32(skb, IPSET_ATTR_SKBPRIO, - cpu_to_be32(skbinfo->skbprio))) || - (skbinfo->skbqueue && - nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, - cpu_to_be16(skbinfo->skbqueue))); -} - static inline void ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, const struct ip_set_ext *ext) -- cgit v1.2.3 From 856391854ce73015fbe2b235f5886205aab166b0 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:05 +0100 Subject: netfilter: ipset: make ip_set_put_flags extern. ip_set_put_flags is rather large for a static inline function in a header-file. Move it to ip_set_core.c and export it. 
Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 44f6de8a1733..4d8b1eaf7708 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -276,28 +276,7 @@ ip_set_ext_destroy(struct ip_set *set, void *data) } } -static inline int -ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) -{ - u32 cadt_flags = 0; - - if (SET_WITH_TIMEOUT(set)) - if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(set->timeout)))) - return -EMSGSIZE; - if (SET_WITH_COUNTER(set)) - cadt_flags |= IPSET_FLAG_WITH_COUNTERS; - if (SET_WITH_COMMENT(set)) - cadt_flags |= IPSET_FLAG_WITH_COMMENT; - if (SET_WITH_SKBINFO(set)) - cadt_flags |= IPSET_FLAG_WITH_SKBINFO; - if (SET_WITH_FORCEADD(set)) - cadt_flags |= IPSET_FLAG_WITH_FORCEADD; - - if (!cadt_flags) - return 0; - return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags)); -} +int ip_set_put_flags(struct sk_buff *skb, struct ip_set *set); /* Netlink CB args */ enum { -- cgit v1.2.3 From 3fbd6c4513b5c27465a1dcf2e4286e6c3183bb1f Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:06 +0100 Subject: netfilter: ipset: move function to ip_set_bitmap_ip.c. One inline function in ip_set_bitmap.h is only called in ip_set_bitmap_ip.c: move it and remove inline function specifier. 
Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set_bitmap.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set_bitmap.h b/include/linux/netfilter/ipset/ip_set_bitmap.h index 2dddbc6dcac7..fcc4d214a788 100644 --- a/include/linux/netfilter/ipset/ip_set_bitmap.h +++ b/include/linux/netfilter/ipset/ip_set_bitmap.h @@ -12,18 +12,4 @@ enum { IPSET_ADD_START_STORED_TIMEOUT, }; -/* Common functions */ - -static inline u32 -range_to_mask(u32 from, u32 to, u8 *bits) -{ - u32 mask = 0xFFFFFFFE; - - *bits = 32; - while (--(*bits) > 0 && mask && (to & mask) != from) - mask <<= 1; - - return mask; -} - #endif /* __IP_SET_BITMAP_H */ -- cgit v1.2.3 From f8615bf8a3dabd84bf844c6f888929495039d389 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:07 +0100 Subject: netfilter: ipset: move ip_set_get_ip_port() to ip_set_bitmap_port.c. ip_set_get_ip_port() is only used in ip_set_bitmap_port.c. Move it there and make it static. 
Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set_getport.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set_getport.h b/include/linux/netfilter/ipset/ip_set_getport.h index d74cd112b88a..1ecaabd9a048 100644 --- a/include/linux/netfilter/ipset/ip_set_getport.h +++ b/include/linux/netfilter/ipset/ip_set_getport.h @@ -20,9 +20,6 @@ static inline bool ip_set_get_ip6_port(const struct sk_buff *skb, bool src, } #endif -extern bool ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, - __be16 *port); - static inline bool ip_set_proto_with_ports(u8 proto) { switch (proto) { -- cgit v1.2.3 From 7d47433cf74f942a414171867d89c08640cfef45 Mon Sep 17 00:00:00 2001 From: Yamin Friedman Date: Mon, 7 Oct 2019 16:59:31 +0300 Subject: net/mlx5: Expose optimal performance scatter entries capability Expose maximum scatter entries per RDMA READ for optimal performance. Signed-off-by: Yamin Friedman Reviewed-by: Or Gerlitz Reviewed-by: Christoph Hellwig Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 138c50d5a353..c0bfb1d90dd2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1153,7 +1153,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_srq[0x5]; u8 reserved_at_b0[0x10]; - u8 reserved_at_c0[0x8]; + u8 max_sgl_for_optimized_performance[0x8]; u8 log_max_cq_sz[0x8]; u8 reserved_at_d0[0xb]; u8 log_max_cq[0x5]; -- cgit v1.2.3 From 5d5a0815f854a5b0e21d97e16cfadad69ce5fb04 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Fri, 27 Sep 2019 12:54:50 +0800 Subject: ipvs: batch __ip_vs_cleanup It's better to batch __ip_vs_cleanup to speedup ipvs connections dismantle. 
Signed-off-by: Haishuang Yan Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 3759167f91f5..93e7a252993d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1324,7 +1324,7 @@ void ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_control_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs); -void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs); +void ip_vs_service_nets_cleanup(struct list_head *net_list); /* IPVS application functions * (from ip_vs_app.c) -- cgit v1.2.3 From 79591b7db21d255db158afaa48c557dcab631a1c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 5 Sep 2019 04:01:12 +0300 Subject: spi: Add a PTP system timestamp to the transfer structure SPI is one of the interfaces used to access devices which have a POSIX clock driver (real time clocks, 1588 timers etc). The fact that the SPI bus is slow is not what the main problem is, but rather the fact that drivers don't take a constant amount of time in transferring data over SPI. When there is a high delay in the readout of time, there will be uncertainty in the value that has been read out of the peripheral. When that delay is constant, the uncertainty can at least be approximated with a certain accuracy which is fine more often than not. Timing jitter occurs all over in the kernel code, and is mainly caused by having to let go of the CPU for various reasons such as preemption, servicing interrupts, going to sleep, etc. Another major reason is CPU dynamic frequency scaling. 
It turns out that the problem of retrieving time from a SPI peripheral with high accuracy can be solved by the use of "PTP system timestamping" - a mechanism to correlate the time when the device has snapshotted its internal time counter with the Linux system time at that same moment. This is sufficient for having a precise time measurement - it is not necessary for the whole SPI transfer to be transmitted "as fast as possible", or "as low-jitter as possible". The system has to be low-jitter for a very short amount of time to be effective. This patch introduces a PTP system timestamping mechanism in struct spi_transfer. This is to be used by SPI device drivers when they need to know the exact time at which the underlying device's time was snapshotted. More often than not, SPI peripherals have a very exact timing for when their SPI-to-interconnect bridge issues a transaction for snapshotting and reading the time register, and that will be dependent on when the SPI-to-interconnect bridge figures out that this is what it should do, aka as soon as it sees byte N of the SPI transfer. Since spi_device drivers are the ones who'd know best how the peripheral behaves in this regard, expose a mechanism in spi_transfer which allows them to specify which word (or word range) from the transfer should be timestamped. Add a default implementation of the PTP system timestamping in the SPI core. This is not going to be satisfactory performance-wise, but should at least increase the likelihood that SPI device drivers will use PTP system timestamping in the future. There are 3 entry points from the core towards the SPI controller drivers: - transfer_one: The driver is passed individual spi_transfers to execute. This is the easiest to timestamp. - transfer_one_message: The core passes the driver an entire spi_message (a potential batch of spi_transfers). The core puts the same pre and post timestamp to all transfers within a message. 
This is not ideal, but nothing better can be done by default anyway, since the core has no insight into how the driver batches the transfers. - transfer: Like transfer_one_message, but for unqueued drivers (i.e. the driver implements its own queue scheduling). Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20190905010114.26718-3-olteanv@gmail.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index af4f265d0f67..27f6b046cf92 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -13,6 +13,7 @@ #include #include #include +#include struct dma_chan; struct property_entry; @@ -409,6 +410,12 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * @fw_translate_cs: If the boot firmware uses different numbering scheme * what Linux expects, this optional hook can be used to translate * between the two. + * @ptp_sts_supported: If the driver sets this to true, it must provide a + * time snapshot in @spi_transfer->ptp_sts as close as possible to the + * moment in time when @spi_transfer->ptp_sts_word_pre and + * @spi_transfer->ptp_sts_word_post were transmitted. + * If the driver does not set this, the SPI core takes the snapshot as + * close to the driver hand-over as possible. * * Each SPI controller can communicate with one or more @spi_device * children. These make a small bus, sharing MOSI, MISO and SCK signals @@ -604,6 +611,15 @@ struct spi_controller { void *dummy_tx; int (*fw_translate_cs)(struct spi_controller *ctlr, unsigned cs); + + /* + * Driver sets this field to indicate it is able to snapshot SPI + * transfers (needed e.g. 
for reading the time of POSIX clocks) + */ + bool ptp_sts_supported; + + /* Interrupt enable state during PTP system timestamping */ + unsigned long irq_flags; }; static inline void *spi_controller_get_devdata(struct spi_controller *ctlr) @@ -644,6 +660,14 @@ extern struct spi_message *spi_get_next_queued_message(struct spi_controller *ct extern void spi_finalize_current_message(struct spi_controller *ctlr); extern void spi_finalize_current_transfer(struct spi_controller *ctlr); +/* Helper calls for driver to timestamp transfer */ +void spi_take_timestamp_pre(struct spi_controller *ctlr, + struct spi_transfer *xfer, + const void *tx, bool irqs_off); +void spi_take_timestamp_post(struct spi_controller *ctlr, + struct spi_transfer *xfer, + const void *tx, bool irqs_off); + /* the spi driver core manages memory for the spi_controller classdev */ extern struct spi_controller *__spi_alloc_controller(struct device *host, unsigned int size, bool slave); @@ -753,6 +777,35 @@ extern void spi_res_release(struct spi_controller *ctlr, * @transfer_list: transfers are sequenced through @spi_message.transfers * @tx_sg: Scatterlist for transmit, currently not for client use * @rx_sg: Scatterlist for receive, currently not for client use + * @ptp_sts_word_pre: The word (subject to bits_per_word semantics) offset + * within @tx_buf for which the SPI device is requesting that the time + * snapshot for this transfer begins. Upon completing the SPI transfer, + * this value may have changed compared to what was requested, depending + * on the available snapshotting resolution (DMA transfer, + * @ptp_sts_supported is false, etc). + * @ptp_sts_word_post: See @ptp_sts_word_pre. The two can be equal (meaning + * that a single byte should be snapshotted). + * If the core takes care of the timestamp (if @ptp_sts_supported is false + * for this controller), it will set @ptp_sts_word_pre to 0, and + * @ptp_sts_word_post to the length of the transfer.
This is done + * purposefully (instead of setting to spi_transfer->len - 1) to denote + * that a transfer-level snapshot taken from within the driver may still + * be of higher quality. + * @ptp_sts: Pointer to a memory location held by the SPI slave device where a + * PTP system timestamp structure may lie. If drivers use PIO or their + * hardware has some sort of assist for retrieving exact transfer timing, + * they can (and should) assert @ptp_sts_supported and populate this + * structure using the ptp_read_system_*ts helper functions. + * The timestamp must represent the time at which the SPI slave device has + * processed the word, i.e. the "pre" timestamp should be taken before + * transmitting the "pre" word, and the "post" timestamp after receiving + * transmit confirmation from the controller for the "post" word. + * @timestamped_pre: Set by the SPI controller driver to denote it has acted + * upon the @ptp_sts request. Not set when the SPI core has taken care of + * the task. SPI device drivers are free to print a warning if this comes + * back unset and they need the better resolution. + * @timestamped_post: See above. The reason why both exist is that these + * booleans are also used to keep state in the core SPI logic. * * SPI transfers always write the same number of bytes as they read. * Protocol drivers should always provide @rx_buf and/or @tx_buf. @@ -842,6 +895,14 @@ struct spi_transfer { u32 effective_speed_hz; + unsigned int ptp_sts_word_pre; + unsigned int ptp_sts_word_post; + + struct ptp_system_timestamp *ptp_sts; + + bool timestamped_pre; + bool timestamped_post; + struct list_head transfer_list; }; -- cgit v1.2.3 From bacb7e1855969bba78b32302453d2cc8ba0bc403 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2019 14:20:34 -0700 Subject: Revert "tun: call dev_get_valid_name() before register_netdevice()" This reverts commit 0ad646c81b2182f7fa67ec0c8c825e0ee165696d. 
As noticed by Jakub, this is no longer needed after commit 11fc7d5a0a2d ("tun: fix memory leak in error path") This no longer exports dev_get_valid_name() for the exclusive use of tun driver. Suggested-by: Jakub Kicinski Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fe45b2c72315..3207e0b9ec4e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4113,9 +4113,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs); -int dev_get_valid_name(struct net *net, struct net_device *dev, - const char *name); - #define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \ alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1) -- cgit v1.2.3 From fd1ac07f3f17fbbc2f08e3b43951bed937d86a7b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 4 Oct 2019 00:21:57 +0300 Subject: xfrm: ifdef setsockopt(UDP_ENCAP_ESPINUDP/UDP_ENCAP_ESPINUDP_NON_IKE) If IPsec is not configured, there is no reason to delay the inevitable. 
Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index aa08a7a5f6ac..dda3c025452e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1613,13 +1613,6 @@ static inline int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optv { return -ENOPROTOOPT; } - -static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) -{ - /* should not happen */ - kfree_skb(skb); - return 0; -} #endif struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, -- cgit v1.2.3 From 4b7740324ed86aa4b02cef134da4b79078294d72 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Oct 2019 19:27:33 +0800 Subject: sctp: add SCTP_ADDR_ADDED event A helper sctp_ulpevent_nofity_peer_addr_change() will be extracted to make peer_addr_change event and enqueue it, and the helper will be called in sctp_assoc_add_peer() to send SCTP_ADDR_ADDED event. This event is described in rfc6458#section-6.1.2: SCTP_ADDR_ADDED: The address is now part of the association. 
Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: Jakub Kicinski --- include/net/sctp/ulpevent.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index e1a92c4610f3..e6ead1ed74dd 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -80,13 +80,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( struct sctp_chunk *chunk, gfp_t gfp); -struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( - const struct sctp_association *asoc, - const struct sockaddr_storage *aaddr, - int flags, - int state, - int error, - gfp_t gfp); +void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, + int state, int error); struct sctp_ulpevent *sctp_ulpevent_make_remote_error( const struct sctp_association *asoc, -- cgit v1.2.3 From b6e6b5f1da7e8d092f86a4351802c27c0170c5a5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Oct 2019 19:27:36 +0800 Subject: sctp: add SCTP_SEND_FAILED_EVENT event This patch is to add a new event SCTP_SEND_FAILED_EVENT described in rfc6458#section-6.1.11. It's a update of SCTP_SEND_FAILED event: struct sctp_sndrcvinfo ssf_info is replaced with struct sctp_sndinfo ssfe_info in struct sctp_send_failed_event. SCTP_SEND_FAILED is being deprecated, but we don't remove it in this patch. Both are being processed in sctp_datamsg_destroy() when the corresp event flag is set. 
Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: Jakub Kicinski --- include/net/sctp/ulpevent.h | 7 +++++++ include/uapi/linux/sctp.h | 16 +++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index e6ead1ed74dd..0b032b92da0b 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -95,6 +95,13 @@ struct sctp_ulpevent *sctp_ulpevent_make_send_failed( __u32 error, gfp_t gfp); +struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event( + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + __u16 flags, + __u32 error, + gfp_t gfp); + struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( const struct sctp_association *asoc, __u16 flags, diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6d5b164af55c..6bce7f9837a9 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -449,6 +449,16 @@ struct sctp_send_failed { __u8 ssf_data[0]; }; +struct sctp_send_failed_event { + __u16 ssf_type; + __u16 ssf_flags; + __u32 ssf_length; + __u32 ssf_error; + struct sctp_sndinfo ssfe_info; + sctp_assoc_t ssf_assoc_id; + __u8 ssf_data[0]; +}; + /* * ssf_flags: 16 bits (unsigned integer) * @@ -605,6 +615,7 @@ struct sctp_event_subscribe { __u8 sctp_stream_reset_event; __u8 sctp_assoc_reset_event; __u8 sctp_stream_change_event; + __u8 sctp_send_failure_event_event; }; /* @@ -632,6 +643,7 @@ union sctp_notification { struct sctp_stream_reset_event sn_strreset_event; struct sctp_assoc_reset_event sn_assocreset_event; struct sctp_stream_change_event sn_strchange_event; + struct sctp_send_failed_event sn_send_failed_event; }; /* Section 5.3.1 @@ -667,7 +679,9 @@ enum sctp_sn_type { #define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT SCTP_STREAM_CHANGE_EVENT, #define SCTP_STREAM_CHANGE_EVENT SCTP_STREAM_CHANGE_EVENT - SCTP_SN_TYPE_MAX = SCTP_STREAM_CHANGE_EVENT, + SCTP_SEND_FAILED_EVENT, +#define 
SCTP_SEND_FAILED_EVENT SCTP_SEND_FAILED_EVENT + SCTP_SN_TYPE_MAX = SCTP_SEND_FAILED_EVENT, #define SCTP_SN_TYPE_MAX SCTP_SN_TYPE_MAX }; -- cgit v1.2.3 From 690a6ca7df3de7b90546bc10a620d1ac8ccaa1a1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 8 Oct 2019 21:03:14 -0700 Subject: DIM: fix dim.h kernel-doc and headers Lots of fixes to kernel-doc in structs, enums, and functions. Also add header files that are being used but not yet #included. Signed-off-by: Randy Dunlap Cc: Yamin Friedman Cc: Tal Gilboa Cc: Saeed Mahameed Cc: Doug Ledford Cc: Jason Gunthorpe Cc: linux-rdma@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Jakub Kicinski --- include/linux/dim.h | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/dim.h b/include/linux/dim.h index 9fa4b3f88c39..b698266d0035 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -4,22 +4,26 @@ #ifndef DIM_H #define DIM_H +#include +#include #include +#include +#include -/** +/* * Number of events between DIM iterations. * Causes a moderation of the algorithm run. */ #define DIM_NEVENTS 64 -/** +/* * Is a difference between values justifies taking an action. * We consider 10% difference as significant. */ #define IS_SIGNIFICANT_DIFF(val, ref) \ (((100UL * abs((val) - (ref))) / (ref)) > 10) -/** +/* * Calculate the gap between two values. * Take wrap-around and variable size into consideration. */ @@ -27,12 +31,13 @@ & (BIT_ULL(bits) - 1)) /** - * Structure for CQ moderation values. + * struct dim_cq_moder - Structure for CQ moderation values. * Used for communications between DIM and its consumer. 
* * @usec: CQ timer suggestion (by DIM) * @pkts: CQ packet counter suggestion (by DIM) - * @cq_period_mode: CQ priod count mode (from CQE/EQE) + * @comps: Completion counter + * @cq_period_mode: CQ period count mode (from CQE/EQE) */ struct dim_cq_moder { u16 usec; @@ -42,13 +47,14 @@ struct dim_cq_moder { }; /** - * Structure for DIM sample data. + * struct dim_sample - Structure for DIM sample data. * Used for communications between DIM and its consumer. * * @time: Sample timestamp * @pkt_ctr: Number of packets * @byte_ctr: Number of bytes * @event_ctr: Number of events + * @comp_ctr: Current completion counter */ struct dim_sample { ktime_t time; @@ -59,12 +65,14 @@ struct dim_sample { }; /** - * Structure for DIM stats. + * struct dim_stats - Structure for DIM stats. * Used for holding current measured rates. * * @ppms: Packets per msec * @bpms: Bytes per msec * @epms: Events per msec + * @cpms: Completions per msec + * @cpe_ratio: Ratio of completions to events */ struct dim_stats { int ppms; /* packets per msec */ @@ -75,12 +83,13 @@ struct dim_stats { }; /** - * Main structure for dynamic interrupt moderation (DIM). + * struct dim - Main structure for dynamic interrupt moderation (DIM). * Used for holding all information about a specific DIM instance. * * @state: Algorithm state (see below) * @prev_stats: Measured rates from previous iteration (for comparison) * @start_sample: Sampled data at start of current iteration + * @measuring_sample: A &dim_sample that is used to update the current events * @work: Work to perform on action required * @priv: A pointer to the struct that points to dim * @profile_ix: Current moderation profile @@ -106,24 +115,21 @@ struct dim { }; /** - * enum dim_cq_period_mode - * - * These are the modes for CQ period count. 
+ * enum dim_cq_period_mode - Modes for CQ period count * * @DIM_CQ_PERIOD_MODE_START_FROM_EQE: Start counting from EQE * @DIM_CQ_PERIOD_MODE_START_FROM_CQE: Start counting from CQE (implies timer reset) * @DIM_CQ_PERIOD_NUM_MODES: Number of modes */ -enum { +enum dim_cq_period_mode { DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0, DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1, DIM_CQ_PERIOD_NUM_MODES }; /** - * enum dim_state + * enum dim_state - DIM algorithm states * - * These are the DIM algorithm states. * These will determine if the algorithm is in a valid state to start an iteration. * * @DIM_START_MEASURE: This is the first iteration (also after applying a new profile) @@ -131,16 +137,15 @@ enum { * need to perform an action * @DIM_APPLY_NEW_PROFILE: DIM consumer is currently applying a profile - no need to measure */ -enum { +enum dim_state { DIM_START_MEASURE, DIM_MEASURE_IN_PROGRESS, DIM_APPLY_NEW_PROFILE, }; /** - * enum dim_tune_state + * enum dim_tune_state - DIM algorithm tune states * - * These are the DIM algorithm tune states. * These will determine which action the algorithm should perform. * * @DIM_PARKING_ON_TOP: Algorithm found a local top point - exit on significant difference @@ -148,7 +153,7 @@ enum { * @DIM_GOING_RIGHT: Algorithm is currently trying higher moderation levels * @DIM_GOING_LEFT: Algorithm is currently trying lower moderation levels */ -enum { +enum dim_tune_state { DIM_PARKING_ON_TOP, DIM_PARKING_TIRED, DIM_GOING_RIGHT, @@ -156,25 +161,23 @@ enum { }; /** - * enum dim_stats_state + * enum dim_stats_state - DIM algorithm statistics states * - * These are the DIM algorithm statistics states. * These will determine the verdict of current iteration. 
* * @DIM_STATS_WORSE: Current iteration shows worse performance than before - * @DIM_STATS_WORSE: Current iteration shows same performance than before - * @DIM_STATS_WORSE: Current iteration shows better performance than before + * @DIM_STATS_SAME: Current iteration shows same performance than before + * @DIM_STATS_BETTER: Current iteration shows better performance than before */ -enum { +enum dim_stats_state { DIM_STATS_WORSE, DIM_STATS_SAME, DIM_STATS_BETTER, }; /** - * enum dim_step_result + * enum dim_step_result - DIM algorithm step results * - * These are the DIM algorithm step results. * These describe the result of a step. * * @DIM_STEPPED: Performed a regular step @@ -182,7 +185,7 @@ enum { * tired parking * @DIM_ON_EDGE: Stepped to the most left/right profile */ -enum { +enum dim_step_result { DIM_STEPPED, DIM_TOO_TIRED, DIM_ON_EDGE, @@ -199,7 +202,7 @@ enum { bool dim_on_top(struct dim *dim); /** - * dim_turn - change profile alterning direction + * dim_turn - change profile altering direction * @dim: DIM context * * Go left if we were going right and vice-versa. @@ -238,7 +241,7 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, struct dim_stats *curr_stats); /** - * dim_update_sample - set a sample's fields with give values + * dim_update_sample - set a sample's fields with given values * @event_ctr: number of events to set * @packets: number of packets to set * @bytes: number of bytes to set @@ -304,8 +307,8 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode); * @end_sample: Current data measurement * * Called by the consumer. - * This is the main logic of the algorithm, where data is processed in order to decide on next - * required action. + * This is the main logic of the algorithm, where data is processed in order + * to decide on next required action. 
*/ void net_dim(struct dim *dim, struct dim_sample end_sample); -- cgit v1.2.3 From a2351c5d86d7acf8eef17fba4ac1fc5b305a37c0 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 9 Oct 2019 10:07:43 +0200 Subject: net/smc: separate SMCD and SMCR link group lists Currently SMCD and SMCR link groups are maintained in one list. To facilitate abnormal termination handling they are split into a separate list for SMCR link groups and separate lists for SMCD link groups per SMCD device. Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- include/net/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index bd9c0fb3b577..c08e8c415673 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -75,6 +75,7 @@ struct smcd_dev { struct workqueue_struct *event_wq; u8 pnetid[SMC_MAX_PNETID_LEN]; bool pnetid_by_user; + struct list_head lgr_list; }; struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, -- cgit v1.2.3 From a0a62ee15a829ebf8aeec55a4f1688230439b3e0 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 9 Oct 2019 10:07:44 +0200 Subject: net/smc: separate locks for SMCD and SMCR link group lists This patch introduces separate locks for the split SMCD and SMCR link group lists. 
Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- include/net/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index c08e8c415673..438bb0261f45 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -76,6 +76,7 @@ struct smcd_dev { u8 pnetid[SMC_MAX_PNETID_LEN]; bool pnetid_by_user; struct list_head lgr_list; + spinlock_t lgr_lock; }; struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, -- cgit v1.2.3 From c3d9494e68c4a5d23227ede822fda9bd68bef8e3 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 9 Oct 2019 10:07:46 +0200 Subject: net/smc: no new connections on disappearing devices Add a "going_away" indication to ISM devices and IB ports and avoid creation of new connections on such disappearing devices. And do not handle ISM events if ISM device is disappearing. Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- include/net/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index 438bb0261f45..05174ae4f325 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -77,6 +77,7 @@ struct smcd_dev { bool pnetid_by_user; struct list_head lgr_list; spinlock_t lgr_lock; + u8 going_away : 1; }; struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, -- cgit v1.2.3 From 84a081f60db63aaae3665118203506aa09a7f94f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Oct 2019 11:11:40 -0700 Subject: bpf: Align struct bpf_prog_stats Do not risk spanning these small structures on two cache lines. 
Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191011181140.2898-1-edumazet@google.com --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b9d22338606..282e28bf41ec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -363,7 +363,7 @@ struct bpf_prog_stats { u64 cnt; u64 nsecs; struct u64_stats_sync syncp; -}; +} __aligned(2 * sizeof(u64)); struct bpf_prog_aux { atomic_t refcnt; -- cgit v1.2.3 From e7a981050a7fb9a14b652365c00d9c5a025704ce Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 10 Oct 2019 15:18:49 +0200 Subject: devlink: propagate extack down to health reporter ops During health reporter operations, driver might want to fill-up the extack message, so propagate extack down to the health reporter ops. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4095657fc23f..6bf3b9e0595a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -507,11 +507,13 @@ enum devlink_health_reporter_state { struct devlink_health_reporter_ops { char *name; int (*recover)(struct devlink_health_reporter *reporter, - void *priv_ctx); + void *priv_ctx, struct netlink_ext_ack *extack); int (*dump)(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg, void *priv_ctx); + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack); int (*diagnose)(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg); + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack); }; /** -- cgit v1.2.3 From 14af7fd1d4279c8db7fbbb3ca0df3b13179eb502 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 12 Oct 2019 18:27:57 +0200 Subject: ethtool: Add support for 400Gbps (50Gbps per lane) link modes 
Add support for 400Gbps speed, link modes of 50Gbps per lane Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 8938b76c4ee3..d4591792f0b4 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1507,6 +1507,11 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67, ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68, + ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT = 69, + ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT = 70, + ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71, + ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, + ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS @@ -1618,6 +1623,7 @@ enum ethtool_link_mode_bit_indices { #define SPEED_56000 56000 #define SPEED_100000 100000 #define SPEED_200000 200000 +#define SPEED_400000 400000 #define SPEED_UNKNOWN -1 -- cgit v1.2.3 From 554032cdfbf4491f38241a3f6b27459408d90df3 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 15 Oct 2019 11:28:46 +0100 Subject: net: phylink: use more linkmode_* Use more linkmode_* helpers rather than open-coding the bitmap operations. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/linkmode.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index a99c58866860..fe740031339d 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -82,4 +82,10 @@ static inline int linkmode_equal(const unsigned long *src1, return bitmap_equal(src1, src2, __ETHTOOL_LINK_MODE_MASK_NBITS); } +static inline int linkmode_subset(const unsigned long *src1, + const unsigned long *src2) +{ + return bitmap_subset(src1, src2, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + #endif /* __LINKMODE_H */ -- cgit v1.2.3 From 2203cbf2c8b58a1e3bef98c47531d431d11639a0 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 15 Oct 2019 11:38:39 +0100 Subject: net: sfp: move fwnode parsing into sfp-bus layer Rather than parsing the sfp firmware node in phylink, parse it in the sfp-bus code, so we can re-use this code for PHYs without having to duplicate the parsing. Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- include/linux/sfp.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 1c35428e98bc..355a08a76fd4 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -508,9 +508,9 @@ int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee, u8 *data); void sfp_upstream_start(struct sfp_bus *bus); void sfp_upstream_stop(struct sfp_bus *bus); -struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode, - void *upstream, - const struct sfp_upstream_ops *ops); +struct sfp_bus *sfp_register_upstream_node(struct fwnode_handle *fwnode, + void *upstream, + const struct sfp_upstream_ops *ops); void sfp_unregister_upstream(struct sfp_bus *bus); #else static inline int sfp_parse_port(struct sfp_bus *bus, @@ -553,11 +553,11 @@ static inline void sfp_upstream_stop(struct sfp_bus *bus) { } -static inline struct sfp_bus *sfp_register_upstream( +static inline struct sfp_bus *sfp_register_upstream_node( struct fwnode_handle *fwnode, void *upstream, const struct sfp_upstream_ops *ops) { - return (struct sfp_bus *)-1; + return NULL; } static inline void sfp_unregister_upstream(struct sfp_bus *bus) -- cgit v1.2.3 From 2ad9d7747c10d17cc06447944fefd4c29ae11eb1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 15 Oct 2019 15:19:15 +0200 Subject: netfilter: conntrack: free extension area immediately Instead of waiting for rcu grace period just free it directly. This is safe because conntrack lookup doesn't consider extensions. Other accesses happen while ct->ext can't be free'd, either because a ct refcount was taken or because the conntrack hash bucket lock or the dying list spinlock have been taken. This allows to remove __krealloc in a followup patch, netfilter was the only user. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_extend.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 112a6f40dfaf..5ae5295aa46d 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -43,7 +43,6 @@ enum nf_ct_ext_id { /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { - struct rcu_head rcu; u8 offset[NF_CT_EXT_NUM]; u8 len; char data[0]; @@ -72,15 +71,6 @@ static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id) /* Destroy all relationships */ void nf_ct_ext_destroy(struct nf_conn *ct); -/* Free operation. If you want to free a object referred from private area, - * please implement __nf_ct_ext_free() and call it. - */ -static inline void nf_ct_ext_free(struct nf_conn *ct) -{ - if (ct->ext) - kfree_rcu(ct->ext, rcu); -} - /* Add this type, returns pointer to data or NULL. */ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); -- cgit v1.2.3 From ca58fbe06c54795f00db79e447f94c2028d30124 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 11 Oct 2019 00:30:37 +0200 Subject: netfilter: add and use nf_hook_slow_list() At this time, NF_HOOK_LIST() macro will iterate the list and then calls nf_hook() for each individual skb. This makes it so the entire list is passed into the netfilter core. The advantage is that we only need to fetch the rule blob once per list instead of per-skb. NF_HOOK_LIST now only works for ipv4 and ipv6, as those are the only callers. 
v2: use skb_list_del_init() instead of list_del (Edward Cree) Signed-off-by: Florian Westphal Acked-by: Edward Cree Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 77ebb61faf48..eb312e7ca36e 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -199,6 +199,8 @@ extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int i); +void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, + const struct nf_hook_entries *e); /** * nf_hook - call a netfilter hook * @@ -311,17 +313,36 @@ NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { - struct sk_buff *skb, *next; - struct list_head sublist; - - INIT_LIST_HEAD(&sublist); - list_for_each_entry_safe(skb, next, head, list) { - list_del(&skb->list); - if (nf_hook(pf, hook, net, sk, skb, in, out, okfn) == 1) - list_add_tail(&skb->list, &sublist); + struct nf_hook_entries *hook_head = NULL; + +#ifdef CONFIG_JUMP_LABEL + if (__builtin_constant_p(pf) && + __builtin_constant_p(hook) && + !static_key_false(&nf_hooks_needed[pf][hook])) + return; +#endif + + rcu_read_lock(); + switch (pf) { + case NFPROTO_IPV4: + hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); + break; + case NFPROTO_IPV6: + hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); + break; + default: + WARN_ON_ONCE(1); + break; } - /* Put passed packets back on main list */ - list_splice(&sublist, head); + + if (hook_head) { + struct nf_hook_state state; + + nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn); + + 
nf_hook_slow_list(head, &state, hook_head); + } + rcu_read_unlock(); } /* Call setsockopt() */ -- cgit v1.2.3 From e8c423fb31fa8b1ef6d7cd14a168de33e7c0d702 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:55 -0700 Subject: bpf: Add typecast to raw_tracepoints to help BTF generation When pahole converts dwarf to btf it emits only used types. Wrap existing __bpf_trace_##template() function into btf_trace_##template typedef and use it in type cast to make gcc emits this type into dwarf. Then pahole will convert it to btf. The "btf_trace_" prefix will be used to identify BTF enabled raw tracepoints. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-2-ast@kernel.org --- include/trace/bpf_probe.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index d6e556c0a085..b04c29270973 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -74,11 +74,12 @@ static inline void bpf_test_probe_##call(void) \ { \ check_trace_callback_type_##call(__bpf_trace_##template); \ } \ +typedef void (*btf_trace_##call)(void *__data, proto); \ static struct bpf_raw_event_map __used \ __attribute__((section("__bpf_raw_tp_map"))) \ __bpf_trace_tp_map_##call = { \ .tp = &__tracepoint_##call, \ - .bpf_func = (void *)__bpf_trace_##template, \ + .bpf_func = (void *)(btf_trace_##call)__bpf_trace_##template, \ .num_args = COUNT_ARGS(args), \ .writable_size = size, \ }; -- cgit v1.2.3 From 7c6a469e3416fa23568c2395a3faa7dd6e376dcb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:56 -0700 Subject: bpf: Add typecast to bpf helpers to help BTF generation When pahole converts dwarf to btf it emits only used types. 
Wrap existing bpf helper functions into typedef and use it in typecast to make gcc emit this type into dwarf. Then pahole will convert it to btf. The "btf_#name_of_helper" types will be used to figure out types of arguments of bpf helpers. The generated code before and after is the same. Only dwarf and btf sections are different. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-3-ast@kernel.org --- include/linux/filter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 2ce57645f3cd..d3d51d7aff2c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -464,10 +464,11 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) #define BPF_CALL_x(x, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ + typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ - return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ + return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) -- cgit v1.2.3 From 8580ac9404f6240668a026785d7d8856f0530409 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:57 -0700 Subject: bpf: Process in-kernel BTF If in-kernel BTF exists parse it and prepare 'struct btf *btf_vmlinux' for further use by the verifier. In-kernel BTF is trusted just like kallsyms and other build artifacts embedded into vmlinux. Yet run this BTF image through BTF verifier to make sure that it is valid and it wasn't mangled during the build.
If in-kernel BTF is incorrect it means either gcc or pahole or kernel are buggy. In such case disallow loading BPF programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-4-ast@kernel.org --- include/linux/bpf_verifier.h | 4 +++- include/linux/btf.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 26a6d58ca78c..713efae62e96 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -330,10 +330,12 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) #define BPF_LOG_STATS 4 #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) #define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) +#define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { - return log->level && log->ubuf && !bpf_verifier_log_full(log); + return (log->level && log->ubuf && !bpf_verifier_log_full(log)) || + log->level == BPF_LOG_KERNEL; } #define BPF_MAX_SUBPROGS 256 diff --git a/include/linux/btf.h b/include/linux/btf.h index 64cdf2a23d42..55d43bc856be 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -56,6 +56,7 @@ bool btf_type_is_void(const struct btf_type *t); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); +struct btf *btf_parse_vmlinux(void); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) -- cgit v1.2.3 From ccfe29eb29c2edcea6552072ef00ff4117f53e83 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:58 -0700 Subject: bpf: Add attach_btf_id attribute to program load Add attach_btf_id attribute to prog_load command. 
It's similar to existing expected_attach_type attribute which is used in several cgroup based program types. Unfortunately expected_attach_type is ignored for tracing programs and cannot be reused for new purpose. Hence introduce attach_btf_id to verify bpf programs against given in-kernel BTF type id at load time. It is strictly checked to be valid for raw_tp programs only. In later patches it will become: btf_id == 0 semantics of existing raw_tp progs. btf_id > 0 raw_tp with BTF and additional type safety. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-5-ast@kernel.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 282e28bf41ec..f916380675dd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -375,6 +375,7 @@ struct bpf_prog_aux { u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ + u32 attach_btf_id; /* in-kernel BTF type id to attach to */ bool verifier_zext; /* Zero extensions has been inserted by verifier.
*/ bool offload_requested; struct bpf_prog **func; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a65c3b0c6935..3bb2cd1de341 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -420,6 +420,7 @@ union bpf_attr { __u32 line_info_rec_size; /* userspace bpf_line_info size */ __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ + __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From 9e15db66136a14cde3f35691f1d839d950118826 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:00 -0700 Subject: bpf: Implement accurate raw_tp context access via BTF libbpf analyzes bpf C program, searches in-kernel BTF for given type name and stores it into expected_attach_type. The kernel verifier expects this btf_id to point to something like: typedef void (*btf_trace_kfree_skb)(void *, struct sk_buff *skb, void *loc); which represents signature of raw_tracepoint "kfree_skb". Then btf_ctx_access() matches ctx+0 access in bpf program with 'skb' and 'ctx+8' access with 'loc' arguments of "kfree_skb" tracepoint. In first case it passes btf_id of 'struct sk_buff *' back to the verifier core and 'void *' in second case. Then the verifier tracks PTR_TO_BTF_ID as any other pointer type. Like PTR_TO_SOCKET points to 'struct bpf_sock', PTR_TO_TCP_SOCK points to 'struct bpf_tcp_sock', and so on. PTR_TO_BTF_ID points to in-kernel structs. If 1234 is btf_id of 'struct sk_buff' in vmlinux's BTF then PTR_TO_BTF_ID#1234 points to one of in kernel skbs. When PTR_TO_BTF_ID#1234 is dereferenced (like r2 = *(u64 *)r1 + 32) the btf_struct_access() checks which field of 'struct sk_buff' is at offset 32. Checks that size of access matches type definition of the field and continues to track the dereferenced type. If that field was a pointer to 'struct net_device' the r2's type will be PTR_TO_BTF_ID#456. 
Where 456 is btf_id of 'struct net_device' in vmlinux's BTF. Such verifier analysis prevents "cheating" in BPF C program. The program cannot cast arbitrary pointer to 'struct sk_buff *' and access it. C compiler would allow type cast, of course, but the verifier will notice type mismatch based on BPF assembly and in-kernel BTF. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-7-ast@kernel.org --- include/linux/bpf.h | 17 ++++++++++++++++- include/linux/bpf_verifier.h | 4 ++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f916380675dd..028555fcd10d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -16,6 +16,7 @@ #include struct bpf_verifier_env; +struct bpf_verifier_log; struct perf_event; struct bpf_prog; struct bpf_map; @@ -281,6 +282,7 @@ enum bpf_reg_type { PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ + PTR_TO_BTF_ID, /* reg points to kernel struct */ }; /* The information passed from prog-specific *_is_valid_access @@ -288,7 +290,11 @@ enum bpf_reg_type { */ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; - int ctx_field_size; + union { + int ctx_field_size; + u32 btf_id; + }; + struct bpf_verifier_log *log; /* for verbose logs */ }; static inline void @@ -483,6 +489,7 @@ struct bpf_event_entry { bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); +const char *kernel_type_name(u32 btf_type_id); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); @@ -748,6 +755,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const 
union bpf_attr *kattr, union bpf_attr __user *uattr); +bool btf_ctx_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info); +int btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, + u32 *next_btf_id); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 713efae62e96..6e7284ea1468 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -52,6 +52,8 @@ struct bpf_reg_state { */ struct bpf_map *map_ptr; + u32 btf_id; /* for PTR_TO_BTF_ID */ + /* Max size from any of the above. */ unsigned long raw; }; @@ -399,6 +401,8 @@ __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); +__printf(2, 3) void bpf_log(struct bpf_verifier_log *log, + const char *fmt, ...); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3 From 2a02759ef5f8a34792df22b41d5e10658fd7bbd3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:02 -0700 Subject: bpf: Add support for BTF pointers to interpreter Pointer to BTF object is a pointer to kernel object or NULL. The memory access in the interpreter has to be done via probe_kernel_read to avoid page faults. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-9-ast@kernel.org --- include/linux/filter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index d3d51d7aff2c..22ebea2e64ea 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -65,6 +65,9 @@ struct ctl_table_header; /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 +/* unused opcode to mark special load instruction. Same as BPF_ABS */ +#define BPF_PROBE_MEM 0x20 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 -- cgit v1.2.3 From 3dec541b2e632d630fe7142ed44f0b3702ef1f8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:03 -0700 Subject: bpf: Add support for BTF pointers to x86 JIT Pointer to BTF object is a pointer to kernel object or NULL. Such pointers can only be used by BPF_LDX instructions. The verifier changed their opcode from LDX|MEM|size to LDX|PROBE_MEM|size to make JITing easier. The number of entries in extable is the number of BPF_LDX insns that access kernel memory via "pointer to BTF type". Only these load instructions can fault. Since x86 extable is relative it has to be allocated in the same memory region as JITed code. Allocate it prior to last pass of JITing and let the last pass populate it. Pointer to extable in bpf_prog_aux is necessary to make page fault handling fast. Page fault handling is done in two steps: 1. bpf_prog_kallsyms_find() finds BPF program that page faulted. It's done by walking rb tree. 2. then extable for given bpf program is binary searched. This process is similar to how page faulting is done for kernel modules. The exception handler skips over faulting x86 instruction and initializes destination register with zero. 
This mimics exact behavior of bpf_probe_read (when probe_kernel_read faults dest is zeroed). JITs for other architectures can add support in similar way. Until then they will reject unknown opcode and fallback to interpreter. Since extable should be aligned and placed near JITed code make bpf_jit_binary_alloc() return 4 byte aligned image offset, so that extable aligning formula in bpf_int_jit_compile() doesn't need to rely on internal implementation of bpf_jit_binary_alloc(). On x86 gcc defaults to 16-byte alignment for regular kernel functions due to better performance. JITed code may be aligned to 16 in the future, but it will use 4 in the meantime. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-10-ast@kernel.org --- include/linux/bpf.h | 3 +++ include/linux/extable.h | 10 ++++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 028555fcd10d..a7330d75bb94 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -24,6 +24,7 @@ struct sock; struct seq_file; struct btf; struct btf_type; +struct exception_table_entry; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -423,6 +424,8 @@ struct bpf_prog_aux { * main prog always has linfo_idx == 0 */ u32 linfo_idx; + u32 num_exentries; + struct exception_table_entry *extable; struct bpf_prog_stats __percpu *stats; union { struct work_struct work; diff --git a/include/linux/extable.h b/include/linux/extable.h index 81ecfaa83ad3..4ab9e78f313b 100644 --- a/include/linux/extable.h +++ b/include/linux/extable.h @@ -33,4 +33,14 @@ search_module_extables(unsigned long addr) } #endif /*CONFIG_MODULES*/ +#ifdef CONFIG_BPF_JIT +const struct exception_table_entry *search_bpf_extables(unsigned long addr); +#else +static inline const struct exception_table_entry * +search_bpf_extables(unsigned long addr) +{ + return 
NULL; +} +#endif + #endif /* _LINUX_EXTABLE_H */ -- cgit v1.2.3 From a7658e1a4164ce2b9eb4a11aadbba38586e93bd6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:04 -0700 Subject: bpf: Check types of arguments passed into helpers Introduce new helper that reuses existing skb perf_event output implementation, but can be called from raw_tracepoint programs that receive 'struct sk_buff *' as tracepoint argument or can walk other kernel data structures to skb pointer. In order to do that teach verifier to resolve true C types of bpf helpers into in-kernel BTF ids. The type of kernel pointer passed by raw tracepoint into bpf program will be tracked by the verifier all the way until it's passed into helper function. For example: kfree_skb() kernel function calls trace_kfree_skb(skb, loc); bpf program receives that skb pointer and may eventually pass it into bpf_skb_output() bpf helper which in-kernel is implemented via bpf_skb_event_output() kernel function. Its first argument in the kernel is 'struct sk_buff *'. The verifier makes sure that types match all the way.
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-11-ast@kernel.org --- include/linux/bpf.h | 18 +++++++++++++----- include/uapi/linux/bpf.h | 27 ++++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a7330d75bb94..2c2c29b49845 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -213,6 +213,7 @@ enum bpf_arg_type { ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ + ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ }; /* type of values returned from helper functions */ @@ -235,11 +236,17 @@ struct bpf_func_proto { bool gpl_only; bool pkt_access; enum bpf_return_type ret_type; - enum bpf_arg_type arg1_type; - enum bpf_arg_type arg2_type; - enum bpf_arg_type arg3_type; - enum bpf_arg_type arg4_type; - enum bpf_arg_type arg5_type; + union { + struct { + enum bpf_arg_type arg1_type; + enum bpf_arg_type arg2_type; + enum bpf_arg_type arg3_type; + enum bpf_arg_type arg4_type; + enum bpf_arg_type arg5_type; + }; + enum bpf_arg_type arg_type[5]; + }; + u32 *btf_id; /* BTF ids of arguments */ }; /* bpf_context is intentionally undefined structure. 
Pointer to bpf_context is @@ -765,6 +772,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); +u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3bb2cd1de341..4af8b0819a32 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2751,6 +2751,30 @@ union bpf_attr { * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2863,7 +2887,8 @@ union bpf_attr { FN(sk_storage_get), \ FN(sk_storage_delete), \ FN(send_signal), \ - FN(tcp_gen_syncookie), + FN(tcp_gen_syncookie), \ + FN(skb_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 68bb8ea8ad0d497c28ed47423246b1ab20f26976 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:15 -0400 Subject: net: dsa: use dsa_to_port helper everywhere Do not let the drivers access the ds->ports static array directly while there is a dsa_to_port helper for this purpose. At the same time, un-const this helper since the SJA1105 driver assigns the priv member of the returned dsa_port structure. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8c3ea0530f65..2e4fe2f8962b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -278,7 +278,7 @@ struct dsa_switch { struct dsa_port ports[]; }; -static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) +static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { return &ds->ports[p]; } -- cgit v1.2.3 From ab8ccae122a41530a89bc899ace0e46defb156a8 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:16 -0400 Subject: net: dsa: add ports list in the switch fabric Add a list of switch ports within the switch fabric. This will help the lookup of a port inside the whole fabric, and it is the first step towards supporting multiple CPU ports, before deprecating the usage of the unique dst->cpu_dp pointer. In preparation for a future allocation of the dsa_port structures, return -ENOMEM in case no structure is returned, even though this error cannot be reached yet. 
Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2e4fe2f8962b..6ff6dfcdc61d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -125,6 +125,9 @@ struct dsa_switch_tree { */ struct dsa_port *cpu_dp; + /* List of switch ports */ + struct list_head ports; + /* * Data for the individual switch chips. */ @@ -195,6 +198,8 @@ struct dsa_port { struct work_struct xmit_work; struct sk_buff_head xmit_queue; + struct list_head list; + /* * Give the switch driver somewhere to hang its per-port private data * structures (accessible from the tagger). -- cgit v1.2.3 From b96ddf254b09447c6b79632cdc02dae3f2454a82 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:17 -0400 Subject: net: dsa: use ports list in dsa_to_port Use the new ports list instead of accessing the dsa_switch array of ports in the dsa_to_port helper. 
Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6ff6dfcdc61d..d2b7ee28f3fd 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -285,7 +285,14 @@ struct dsa_switch { static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { - return &ds->ports[p]; + struct dsa_switch_tree *dst = ds->dst; + struct dsa_port *dp = NULL; + + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds == ds && dp->index == p) + break; + + return dp; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) -- cgit v1.2.3 From fb35c60cbacc67a6075fb8e3d98fa348665662fe Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:19 -0400 Subject: net: dsa: use ports list to setup switches Use the new ports list instead of iterating over switches and their ports when setting up the switches and their ports. At the same time, provide setup states and messages for ports and switches as it is done for the trees. Signed-off-by: Vivien Didelot Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index d2b7ee28f3fd..bd08bdee8341 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -215,9 +215,13 @@ struct dsa_port { * Original copy of the master netdev net_device_ops */ const struct net_device_ops *orig_ndo_ops; + + bool setup; }; struct dsa_switch { + bool setup; + struct device *dev; /* -- cgit v1.2.3 From da4561cda2ea6240fc61442eeb2acc47e2e0cae3 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:24 -0400 Subject: net: dsa: use ports list to setup default CPU port Use the new ports list instead of iterating over switches and their ports when setting up the default CPU port. 
Unassign it on teardown. Now that we can iterate over multiple CPU ports, remove dst->cpu_dp. At the same time, provide a better error message for CPU-less tree. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index bd08bdee8341..f572134eb5de 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -120,11 +120,6 @@ struct dsa_switch_tree { */ struct dsa_platform_data *pd; - /* - * The switch port to which the CPU is attached. - */ - struct dsa_port *cpu_dp; - /* List of switch ports */ struct list_head ports; -- cgit v1.2.3 From 05f294a852358a46d9236cc777901f49a4f0ae85 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:29 -0400 Subject: net: dsa: allocate ports on touch Allocate the struct dsa_port the first time it is accessed with dsa_port_touch, and remove the static dsa_port array from the dsa_switch structure. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index f572134eb5de..9bc1d3f71f89 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -277,9 +277,7 @@ struct dsa_switch { */ bool vlan_filtering; - /* Dynamically allocated ports, keep last */ size_t num_ports; - struct dsa_port ports[]; }; static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) -- cgit v1.2.3 From 7e99e34701728d54ccd0466eccf377a42b9db215 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:30 -0400 Subject: net: dsa: remove dsa_switch_alloc helper Now that ports are dynamically listed in the fabric, there is no need to provide a special helper to allocate the dsa_switch structure. 
This will give more flexibility to drivers to embed this structure as they wish in their private structure. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9bc1d3f71f89..e3c14dc3bab9 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -577,7 +577,6 @@ static inline bool dsa_can_decode(const struct sk_buff *skb, return false; } -struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n); void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); #ifdef CONFIG_PM_SLEEP -- cgit v1.2.3 From 5e5b03d163e15a40b0fa57c70b4e8edd549b0b98 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Tue, 22 Oct 2019 13:59:25 +0100 Subject: xdp: Fix type of string pointer in __XDP_ACT_SYM_TAB The table entry in __XDP_ACT_SYM_TAB for the last item is set to { -1, 0 } where it should be { -1, NULL } as the second item is a pointer to a string. 
Fixes the following sparse warnings: ./include/trace/events/xdp.h:28:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:53:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:82:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:140:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:155:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:190:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:225:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:260:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:318:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:356:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:390:1: warning: Using plain integer as NULL pointer Signed-off-by: Ben Dooks (Codethink) Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191022125925.10508-1-ben.dooks@codethink.co.uk --- include/trace/events/xdp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 8c8420230a10..c7e3c9c5bad3 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -22,7 +22,7 @@ #define __XDP_ACT_SYM_FN(x) \ { XDP_##x, #x }, #define __XDP_ACT_SYM_TAB \ - __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, 0 } + __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, NULL } __XDP_ACT_MAP(__XDP_ACT_TP_FN) TRACE_EVENT(xdp_exception, -- cgit v1.2.3 From 71a8a63b9dbdeba8205a37979b81d4fba499d079 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Oct 2019 14:23:55 +0200 Subject: netfilter: nf_flow_table: move priority to struct nf_flowtable Hardware offload needs access to the priority field, store this field in the nf_flowtable object. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + include/net/netfilter/nf_tables.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index b37a7d608134..158514281a75 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -24,6 +24,7 @@ struct nf_flowtable_type { struct nf_flowtable { struct list_head list; struct rhashtable rhashtable; + int priority; const struct nf_flowtable_type *type; struct delayed_work gc_work; }; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 001d294edf57..d529dfb5aa64 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1155,7 +1155,6 @@ void nft_unregister_obj(struct nft_object_type *obj_type); * @table: the table the flow table is contained in * @name: name of this flow table * @hooknum: hook number - * @priority: hook priority * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table @@ -1169,7 +1168,6 @@ struct nft_flowtable { struct nft_table *table; char *name; int hooknum; - int priority; int ops_len; u32 genmask:2, use:30; -- cgit v1.2.3 From 3f0465a9ef02624e0a36db9e7c9bedcafcd6f6fe Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Oct 2019 14:24:01 +0200 Subject: netfilter: nf_tables: dynamically allocate hooks per net_device in flowtables Use a list of hooks per device instead of an array.
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index d529dfb5aa64..7a2ac82ee0ad 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -963,6 +963,12 @@ struct nft_stats { struct u64_stats_sync syncp; }; +struct nft_hook { + struct list_head list; + struct nf_hook_ops ops; + struct rcu_head rcu; +}; + /** * struct nft_base_chain - nf_tables base chain * @@ -1173,7 +1179,7 @@ struct nft_flowtable { use:30; u64 handle; /* runtime data below here */ - struct nf_hook_ops *ops ____cacheline_aligned; + struct list_head hook_list ____cacheline_aligned; struct nf_flowtable data; }; -- cgit v1.2.3 From cb662ac6711f7135618526221498ebfae155531a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Oct 2019 14:29:47 +0200 Subject: netfilter: nf_tables: increase maximum devices number per flowtable Raise the maximum limit of devices per flowtable up to 256. Rename NFT_FLOWTABLE_DEVICE_MAX to NFT_NETDEVICE_MAX in preparation to reuse the netdev hook parser for ingress basechain.
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 7a2ac82ee0ad..3d71070e747a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1152,7 +1152,7 @@ struct nft_object_ops { int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); -#define NFT_FLOWTABLE_DEVICE_MAX 8 +#define NFT_NETDEVICE_MAX 256 /** * struct nft_flowtable - nf_tables flow table -- cgit v1.2.3 From d54725cd11a57c30f650260cfb0a92c268bdc3e0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Oct 2019 14:30:05 +0200 Subject: netfilter: nf_tables: support for multiple devices per netdev hook This patch allows you to register one netdev basechain to multiple devices. This adds a new NFTA_HOOK_DEVS netlink attribute to specify the list of netdevices. Basechains store a list of hooks. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3d71070e747a..5bf569e1173b 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -973,21 +973,21 @@ struct nft_hook { * struct nft_base_chain - nf_tables base chain * * @ops: netfilter hook ops + * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy * @stats: per-cpu chain stats * @chain: the chain - * @dev_name: device name that this base chain is attached to (if any) * @flow_block: flow block (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; + struct list_head hook_list; const struct nft_chain_type *type; u8 policy; u8 flags; struct nft_stats __percpu *stats; struct nft_chain chain; - char dev_name[IFNAMSIZ]; struct flow_block flow_block; }; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index ed8881ad18ed..81fed16fe2b2 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -144,12 +144,14 @@ enum nft_list_attributes { * @NFTA_HOOK_HOOKNUM: netfilter hook number (NLA_U32) * @NFTA_HOOK_PRIORITY: netfilter hook priority (NLA_U32) * @NFTA_HOOK_DEV: netdevice name (NLA_STRING) + * @NFTA_HOOK_DEVS: list of netdevices (NLA_NESTED) */ enum nft_hook_attributes { NFTA_HOOK_UNSPEC, NFTA_HOOK_HOOKNUM, NFTA_HOOK_PRIORITY, NFTA_HOOK_DEV, + NFTA_HOOK_DEVS, __NFTA_HOOK_MAX }; #define NFTA_HOOK_MAX (__NFTA_HOOK_MAX - 1) -- cgit v1.2.3 From fa6e98cee558622565c97924e922b97340aeabd8 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 22 Oct 2019 11:31:07 -0700 Subject: net: phy: add support for clause 37 auto-negotiation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit This patch adds support for clause 37 1000Base-X auto-negotiation. Signed-off-by: Heiner Kallweit Signed-off-by: Tao Ren Tested-by: René van Dorst Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9a0e981df502..78436d58ce7c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1106,6 +1106,10 @@ int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, u16 regnum, u16 val); +/* Clause 37 */ +int genphy_c37_config_aneg(struct phy_device *phydev); +int genphy_c37_read_status(struct phy_device *phydev); + /* Clause 45 PHY */ int genphy_c45_restart_aneg(struct phy_device *phydev); int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart); -- cgit v1.2.3 From b9bcb95315febd09419ab870ddc7cb98a393f9d0 Mon Sep 17 00:00:00 2001 From: Tao Ren Date: Tue, 22 Oct 2019 11:31:08 -0700 Subject: net: phy: broadcom: add 1000Base-X support for BCM54616S The BCM54616S PHY cannot work properly in RGMII->1000Base-X mode, mainly because genphy functions are designed for copper links, and 1000Base-X (clause 37) auto negotiation needs to be handled differently. This patch enables 1000Base-X support for BCM54616S by customizing 3 driver callbacks, and it's verified to be working on Facebook CMM BMC platform (RGMII->1000Base-KX): - probe: probe callback detects PHY's operation mode based on INTERF_SEL[1:0] pins and 1000X/100FX selection bit in SerDES 100-FX Control register. - config_aneg: calls genphy_c37_config_aneg when the PHY is running in 1000Base-X mode; otherwise, genphy_config_aneg will be called. - read_status: calls genphy_c37_read_status when the PHY is running in 1000Base-X mode; otherwise, genphy_read_status will be called. 
Note: BCM54616S PHY can also be configured in RGMII->100Base-FX mode, and 100Base-FX support is not available as of now. Signed-off-by: Tao Ren Acked-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6db2d9a6e503..b475e7f20d28 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -200,9 +200,15 @@ #define BCM5482_SHD_SSD 0x14 /* 10100: Secondary SerDes control */ #define BCM5482_SHD_SSD_LEDM 0x0008 /* SSD LED Mode enable */ #define BCM5482_SHD_SSD_EN 0x0001 /* SSD enable */ -#define BCM5482_SHD_MODE 0x1f /* 11111: Mode Control Register */ -#define BCM5482_SHD_MODE_1000BX 0x0001 /* Enable 1000BASE-X registers */ +/* 10011: SerDes 100-FX Control Register */ +#define BCM54616S_SHD_100FX_CTRL 0x13 +#define BCM54616S_100FX_MODE BIT(0) /* 100-FX SerDes Enable */ + +/* 11111: Mode Control Register */ +#define BCM54XX_SHD_MODE 0x1f +#define BCM54XX_SHD_INTF_SEL_MASK GENMASK(2, 1) /* INTERF_SEL[1:0] */ +#define BCM54XX_SHD_MODE_1000BX BIT(0) /* Enable 1000-X registers */ /* * EXPANSION SHADOW ACCESS REGISTERS. (PHY REG 0x15, 0x16, and 0x17) -- cgit v1.2.3 From 3820729160440158a014add69cc0d371061a96b2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Oct 2019 17:18:11 -0700 Subject: bpf: Prepare btf_ctx_access for non raw_tp use case This patch makes a few changes to btf_ctx_access() to prepare it for non raw_tp use case where the attach_btf_id is not necessarily a BTF_KIND_TYPEDEF. It moves the "btf_trace_" prefix check and typedef-follow logic to a new function "check_attach_btf_id()" which is called only once during bpf_check(). btf_ctx_access() only operates on a BTF_KIND_FUNC_PROTO type now. That should also be more efficient since it is done only once instead of every time check_ctx_access() is called.
"check_attach_btf_id()" needs to find the func_proto type from the attach_btf_id. It needs to store the result into the newly added prog->aux->attach_func_proto. func_proto btf type has no name, so a proper name should be stored into "attach_func_name" also. v2: - Move the "btf_trace_" check to an earlier verifier phase (Alexei) Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191025001811.1718491-1-kafai@fb.com --- include/linux/bpf.h | 5 +++++ include/linux/btf.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2c2c29b49845..171be30fe0ae 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -392,6 +392,11 @@ struct bpf_prog_aux { u32 attach_btf_id; /* in-kernel BTF type id to attach to */ bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; + bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ + const struct btf_type *attach_func_proto; + /* function name for valid attach_btf_id */ + const char *attach_func_name; struct bpf_prog **func; void *jit_data; /* JIT specific data. 
arch dependent */ struct latch_tree_node ksym_tnode; diff --git a/include/linux/btf.h b/include/linux/btf.h index 55d43bc856be..9dee00859c5f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -5,6 +5,7 @@ #define _LINUX_BTF_H 1 #include +#include struct btf; struct btf_member; @@ -53,6 +54,36 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); +static inline bool btf_type_is_ptr(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; +} + +static inline bool btf_type_is_int(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_INT; +} + +static inline bool btf_type_is_enum(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; +} + +static inline bool btf_type_is_typedef(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; +} + +static inline bool btf_type_is_func(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; +} + +static inline bool btf_type_is_func_proto(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; +} + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); -- cgit v1.2.3 From 480274787d7e3458bc5a7cfbbbe07033984ad711 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 23 Oct 2019 11:09:26 -0400 Subject: tcp: add TCP_INFO status for failed client TFO The TCPI_OPT_SYN_DATA bit as part of tcpi_options currently reports whether or not data-in-SYN was ack'd on both the client and server side. We'd like to gather more information on the client-side in the failure case in order to indicate the reason for the failure. This can be useful for not only debugging TFO, but also for creating TFO socket policies. 
For example, if a middle box removes the TFO option or drops a data-in-SYN, we can detect this case, and turn off TFO for these connections saving the extra retransmits. The newly added tcpi_fastopen_client_fail status is 2 bits and has the following 4 states: 1) TFO_STATUS_UNSPEC Catch-all state which includes when TFO is disabled via black hole detection, which is indicated via LINUX_MIB_TCPFASTOPENBLACKHOLE. 2) TFO_COOKIE_UNAVAILABLE If TFO_CLIENT_NO_COOKIE mode is off, this state indicates that no cookie is available in the cache. 3) TFO_DATA_NOT_ACKED Data was sent with SYN, we received a SYN/ACK but it did not cover the data portion. Cookie is not accepted by server because the cookie may be invalid or the server may be overloaded. 4) TFO_SYN_RETRANSMITTED Data was sent with SYN, we received a SYN/ACK which did not cover the data after at least 1 additional SYN was sent (without data). It may be the case that a middle-box is dropping data-in-SYN packets. Thus, it would be more efficient to not use TFO on this connection to avoid extra retransmits during connection establishment. These new fields do not cover all the cases where TFO may fail, but other failures, such as SYN/ACK + data being dropped, will result in the connection not becoming established. And a connection blackhole after session establishment shows up as a stalled connection. Signed-off-by: Jason Baron Cc: Eric Dumazet Cc: Neal Cardwell Cc: Christoph Paasch Cc: Yuchung Cheng Acked-by: Yuchung Cheng Signed-off-by: David S.
Miller --- include/linux/tcp.h | 2 +- include/uapi/linux/tcp.h | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 668e25a76d69..ca6f01531e64 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -223,7 +223,7 @@ struct tcp_sock { fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ - unused:2; + fastopen_client_fail:2; /* reason why fastopen failed */ u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 81e697978e8b..74af1f759cee 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -155,6 +155,14 @@ enum { TCP_QUEUES_NR, }; +/* why fastopen failed from client perspective */ +enum tcp_fastopen_client_fail { + TFO_STATUS_UNSPEC, /* catch-all */ + TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */ + TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */ + TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */ +}; + /* for TCP_INFO socket option */ #define TCPI_OPT_TIMESTAMPS 1 #define TCPI_OPT_SACK 2 @@ -211,7 +219,7 @@ struct tcp_info { __u8 tcpi_backoff; __u8 tcpi_options; __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; - __u8 tcpi_delivery_rate_app_limited:1; + __u8 tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; __u32 tcpi_rto; __u32 tcpi_ato; -- cgit v1.2.3 From ae4a50ee3151d6cb11c56297699ca9025eb18077 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Fri, 25 Oct 2019 10:36:47 +1300 Subject: mac80211: typo fixes in kerneldoc comments Correct some trivial typos in kerneldoc comments. 
Signed-off-by: Chris Packham Link: https://lore.kernel.org/r/20191024213647.5507-1-chris.packham@alliedtelesis.co.nz Signed-off-by: Johannes Berg --- include/net/mac80211.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d69081c38788..67866fa1328d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -312,7 +312,7 @@ struct ieee80211_vif_chanctx_switch { * @BSS_CHANGED_KEEP_ALIVE: keep alive options (idle period or protected * keep alive) changed. * @BSS_CHANGED_MCAST_RATE: Multicast Rate setting changed for this interface - * @BSS_CHANGED_FTM_RESPONDER: fime timing reasurement request responder + * @BSS_CHANGED_FTM_RESPONDER: fine timing measurement request responder * functionality changed for this BSS (AP mode). * @BSS_CHANGED_TWT: TWT status changed * @BSS_CHANGED_HE_OBSS_PD: OBSS Packet Detection status changed. @@ -1059,7 +1059,7 @@ struct ieee80211_tx_info { }; /** - * struct ieee80211_tx_status - extended tx staus info for rate control + * struct ieee80211_tx_status - extended tx status info for rate control * * @sta: Station that the packet was transmitted for * @info: Basic tx status information @@ -1702,7 +1702,7 @@ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); * %IEEE80211_KEY_FLAG_SW_MGMT_TX flag to encrypt such frames in SW. * @IEEE80211_KEY_FLAG_GENERATE_IV_MGMT: This flag should be set by the * driver for a CCMP/GCMP key to indicate that is requires IV generation - * only for managment frames (MFP). + * only for management frames (MFP). * @IEEE80211_KEY_FLAG_RESERVE_TAILROOM: This flag should be set by the * driver for a key to indicate that sufficient tailroom must always * be reserved for ICV or MIC, even when HW encryption is enabled. @@ -1998,7 +1998,7 @@ struct ieee80211_sta { * * * If the skb is transmitted as part of a BA agreement, the * A-MSDU maximal size is min(max_amsdu_len, 4065) bytes. 
- * * If the skb is not part of a BA aggreement, the A-MSDU maximal + * * If the skb is not part of a BA agreement, the A-MSDU maximal * size is min(max_amsdu_len, 7935) bytes. * * Both additional HT limits must be enforced by the low level @@ -3187,13 +3187,13 @@ enum ieee80211_rate_control_changed { * * With the support for multi channel contexts and multi channel operations, * remain on channel operations might be limited/deferred/aborted by other - * flows/operations which have higher priority (and vise versa). + * flows/operations which have higher priority (and vice versa). * Specifying the ROC type can be used by devices to prioritize the ROC * operations compared to other operations/flows. * * @IEEE80211_ROC_TYPE_NORMAL: There are no special requirements for this ROC. * @IEEE80211_ROC_TYPE_MGMT_TX: The remain on channel request is required - * for sending managment frames offchannel. + * for sending management frames offchannel. */ enum ieee80211_roc_type { IEEE80211_ROC_TYPE_NORMAL = 0, @@ -5616,7 +5616,7 @@ void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw, /** * ieee80211_iter_chan_contexts_atomic - iterate channel contexts - * @hw: pointre obtained from ieee80211_alloc_hw(). + * @hw: pointer obtained from ieee80211_alloc_hw(). * @iter: iterator function * @iter_data: data passed to iterator function * @@ -6364,7 +6364,7 @@ ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, * again. * * The API ieee80211_txq_may_transmit() also ensures that TXQ list will be - * aligned aginst driver's own round-robin scheduler list. i.e it rotates + * aligned against driver's own round-robin scheduler list. i.e it rotates * the TXQ list till it makes the requested node becomes the first entry * in TXQ list. Thus both the TXQ list and driver's list are in sync. 
If this * function returns %true, the driver is expected to schedule packets -- cgit v1.2.3 From 3f2aef10ffad76c31275ae66b1d6e486b22619d6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 24 Oct 2019 11:32:12 -0700 Subject: mac80211: fix a typo of "function" Signed-off-by: Joe Perches Link: https://lore.kernel.org/r/4d53be6c963542878d370ff1a6dc7c3a89b28d23.camel@perches.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 67866fa1328d..f5996960eace 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2626,7 +2626,7 @@ ieee80211_get_alt_retry_rate(const struct ieee80211_hw *hw, * @hw: the hardware * @skb: the skb * - * Free a transmit skb. Use this funtion when some failure + * Free a transmit skb. Use this function when some failure * to transmit happened and thus status cannot be reported. */ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); -- cgit v1.2.3 From c199ce4f9dd896c716aece33e6750be34aea1151 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 24 Oct 2019 17:22:01 +0200 Subject: net: Fix misspellings of "configure" and "configuration" Fix various misspellings of "configuration" and "configure". Signed-off-by: Geert Uytterhoeven Acked-by: Kalle Valo Signed-off-by: David S. Miller --- include/uapi/linux/dcbnl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index 69df19aa8e72..a791a94013a6 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -286,7 +286,7 @@ struct dcbmsg { * @DCB_CMD_GNUMTCS: get the number of traffic classes currently supported * @DCB_CMD_SNUMTCS: set the number of traffic classes * @DCB_CMD_GBCN: set backward congestion notification configuration - * @DCB_CMD_SBCN: get backward congestion notification configration. 
+ * @DCB_CMD_SBCN: get backward congestion notification configuration. * @DCB_CMD_GAPP: get application protocol configuration * @DCB_CMD_SAPP: set application protocol configuration * @DCB_CMD_IEEE_SET: set IEEE 802.1Qaz configuration -- cgit v1.2.3 From e1b185491f739983b596804953586346e50351c9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 24 Oct 2019 17:23:23 +0200 Subject: net: Fix various misspellings of "connect" Fix misspellings of "disconnect", "disconnecting", "connections", and "disconnected". Signed-off-by: Geert Uytterhoeven Acked-by: Kalle Valo Acked-by: Simon Horman Signed-off-by: David S. Miller --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4ab2c49423dc..ab6850bbba99 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6593,7 +6593,7 @@ struct cfg80211_roam_info { * time it is accessed in __cfg80211_roamed() due to delay in scheduling * rdev->event_work. In case of any failures, the reference is released * either in cfg80211_roamed() or in __cfg80211_romed(), Otherwise, it will be - * released while diconneting from the current bss. + * released while disconnecting from the current bss. */ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, gfp_t gfp); -- cgit v1.2.3 From 6b297524234ccf3954b54609ab6bc2e8c4d3f677 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 25 Oct 2019 01:03:51 +0200 Subject: net: dsa: Add support for devlink device parameters Add plumbing to allow DSA drivers to register parameters with devlink. To keep with the abstraction, the DSA drivers pass the ds structure to these helpers, and the DSA core then translates that to the devlink structure associated to the device. Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/net/dsa.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index e3c14dc3bab9..d5f6e5ccca38 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -550,6 +550,29 @@ struct dsa_switch_ops { */ netdev_tx_t (*port_deferred_xmit)(struct dsa_switch *ds, int port, struct sk_buff *skb); + /* Devlink parameters */ + int (*devlink_param_get)(struct dsa_switch *ds, u32 id, + struct devlink_param_gset_ctx *ctx); + int (*devlink_param_set)(struct dsa_switch *ds, u32 id, + struct devlink_param_gset_ctx *ctx); +}; + +#define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ + DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes, \ + dsa_devlink_param_get, dsa_devlink_param_set, NULL) + +int dsa_devlink_param_get(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx); +int dsa_devlink_param_set(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx); +int dsa_devlink_params_register(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count); +void dsa_devlink_params_unregister(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count); +struct dsa_devlink_priv { + struct dsa_switch *ds; }; struct dsa_switch_driver { -- cgit v1.2.3 From d607525bd912860aad137326a1076d1e9880ddf0 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 25 Oct 2019 14:48:53 -0400 Subject: net: dsa: return directly from dsa_to_port Return directly from within the loop as soon as the port is found, otherwise we won't return NULL if the end of the list is reached. Fixes: b96ddf254b09 ("net: dsa: use ports list in dsa_to_port") Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/net/dsa.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index d5f6e5ccca38..9aba326abb64 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -283,13 +283,13 @@ struct dsa_switch { static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { struct dsa_switch_tree *dst = ds->dst; - struct dsa_port *dp = NULL; + struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dp->ds == ds && dp->index == p) - break; + return dp; - return dp; + return NULL; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) -- cgit v1.2.3 From 8466a57dfbb0c9bf6db4685ed9c4144b8deec688 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 29 Oct 2019 12:43:46 +0100 Subject: net/smc: remove unneeded include for smc.h The only smc-related reference in net/sock.h is struct smc_hashinfo. But just its address is referred to. Thus there is no need for the include of net/smc.h. Remove it. Suggested-by: Jakub Kicinski Reviewed-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/net/sock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 380312cc67a9..09c26a5ecbff 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -66,7 +66,6 @@ #include #include #include -#include #include /* -- cgit v1.2.3 From c0bceb97db9efc72629dd00cd0d9812f24d4ba2d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 30 Oct 2019 14:00:41 +0100 Subject: tipc: add smart nagle feature We introduce a feature that works like a combination of TCP_NAGLE and TCP_CORK, but without some of the weaknesses of those. In particular, we will not observe long delivery delays because of delayed acks, since the algorithm itself decides if and when acks are to be sent from the receiving peer.
- The nagle property as such is determined by manipulating a new 'maxnagle' field in struct tipc_sock. If certain conditions are met, 'maxnagle' will define max size of the messages which can be bundled. If it is set to zero no messages are ever bundled, implying that the nagle property is disabled. - A socket with the nagle property enabled enters nagle mode when more than 4 messages have been sent out without receiving any data message from the peer. - A socket leaves nagle mode whenever it receives a data message from the peer. In nagle mode, messages smaller than 'maxnagle' are accumulated in the socket write queue. The last buffer in the queue is marked with a new 'ack_required' bit, which forces the receiving peer to send a CONN_ACK message back to the sender upon reception. The accumulated contents of the write queue is transmitted when one of the following events or conditions occur. - A CONN_ACK message is received from the peer. - A data message is received from the peer. - A SOCK_WAKEUP pseudo message is received from the link level. - The write queue contains more than 64 1k blocks of data. - The connection is being shut down. - There is no CONN_ACK message to expect. I.e., there is currently no outstanding message where the 'ack_required' bit was set. As a consequence, the first message added after we enter nagle mode is always sent directly with this bit set. This new feature gives a 50-100% improvement of throughput for small (i.e., less than MTU size) messages, while it might add up to one RTT to latency time when the socket is in nagle mode. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. 
Miller --- include/uapi/linux/tipc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 7df026ea6aff..76421b878767 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -191,6 +191,7 @@ struct sockaddr_tipc { #define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ #define TIPC_GROUP_LEAVE 136 /* No argument */ #define TIPC_SOCK_RECVQ_USED 137 /* Default: none (read only) */ +#define TIPC_NODELAY 138 /* Default: false */ /* * Flag values -- cgit v1.2.3 From 98298e6ca6d5908f96e529e70a254a4d5bf754e7 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 29 Oct 2019 14:50:50 +0100 Subject: flow_dissector: add meaningful comments Documents two piece of code which can't be understood at a glance. Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 90bd210be060..7747af3cc500 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -282,6 +282,7 @@ struct flow_keys { struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; + /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; -- cgit v1.2.3 From 5dec597e5cd0f4c3000d120508efa64157d5bd7a Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 29 Oct 2019 14:50:52 +0100 Subject: flow_dissector: extract more ICMP information The ICMP flow dissector currently parses only the Type and Code fields. Some ICMP packets (echo, timestamp) have a 16 bit Identifier field which is used to correlate packets. Add such field in flow_dissector_key_icmp and replace skb_flow_get_be16() with a more complex function which populate this field. Signed-off-by: Matteo Croce Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 7747af3cc500..f8541d018848 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -6,6 +6,8 @@ #include #include +struct sk_buff; + /** * struct flow_dissector_key_control: * @thoff: Transport header offset @@ -156,19 +158,16 @@ struct flow_dissector_key_ports { /** * flow_dissector_key_icmp: - * @ports: type and code of ICMP header - * icmp: ICMP type (high) and code (low) * type: ICMP type * code: ICMP code + * id: session identifier */ struct flow_dissector_key_icmp { - union { - __be16 icmp; - struct { - u8 type; - u8 code; - }; + struct { + u8 type; + u8 code; }; + u16 id; }; /** @@ -282,6 +281,7 @@ struct flow_keys { struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; + struct flow_dissector_key_icmp icmp; /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; @@ -316,6 +316,9 @@ static inline bool flow_keys_have_l4(const struct flow_keys *keys) } u32 flow_hash_from_keys(struct flow_keys *keys); +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) -- cgit v1.2.3 From c8ecebd04cbb6badb46d42fe54282e7883ed63cc Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:00 +0200 Subject: net: sched: extract common action counters update code into function Currently, all implementations of tc_action_ops->stats_update() callback have almost exactly the same implementation of counters update code (besides gact which also updates drop counter). 
In order to simplify support for using both percpu-allocated and regular action counters depending on run-time flag in following patches, extract action counters update code into standalone function in act API. This commit doesn't change functionality. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index b18c699681ca..f6f66c692385 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -186,6 +186,8 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); +void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, + bool drop, bool hw); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); int tcf_action_check_ctrlact(int action, struct tcf_proto *tp, -- cgit v1.2.3 From 5e1ad95b630e652d3467d1fd1f0b5e5ea2c441e2 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:01 +0200 Subject: net: sched: extract bstats update code into function Extract common code that increments cpu_bstats counter into standalone act API function. Change hardware offloaded actions that use percpu counter allocation to use the new function instead of incrementing cpu_bstats directly. This commit doesn't change functionality. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/act_api.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index f6f66c692385..9a32853f77f9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -186,6 +186,13 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); + +static inline void tcf_action_update_bstats(struct tc_action *a, + struct sk_buff *skb) +{ + bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); +} + void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, bool drop, bool hw); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); -- cgit v1.2.3 From 26b537a88ca5b7399c7ab0656e06dbd9da9513c1 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:02 +0200 Subject: net: sched: extract qstats update code into functions Extract common code that increments cpu_qstats counters into standalone act API functions. Change hardware offloaded actions that use percpu counter allocation to use the new functions instead of accessing cpu_qstats directly. This commit doesn't change functionality. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/act_api.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 9a32853f77f9..8d6861ce205b 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -193,6 +193,22 @@ static inline void tcf_action_update_bstats(struct tc_action *a, bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); } +static inline struct gnet_stats_queue * +tcf_action_get_qstats(struct tc_action *a) +{ + return this_cpu_ptr(a->cpu_qstats); +} + +static inline void tcf_action_inc_drop_qstats(struct tc_action *a) +{ + qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); +} + +static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) +{ + qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); +} + void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, bool drop, bool hw); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); -- cgit v1.2.3 From ef816f3c49c1c404ababc50e10d4cbe5109da678 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:03 +0200 Subject: net: sched: don't expose action qstats to skb_tc_reinsert() Previous commit introduced helper function for updating qstats and refactored set of actions to use the helpers, instead of modifying qstats directly. However, one of the affected action exposes its qstats to skb_tc_reinsert(), which then modifies it. Refactor skb_tc_reinsert() to return integer error code and don't increment overlimit qstats in case of error, and use the returned error code in tcf_mirred_act() to manually increment the overlimit counter with new helper function. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 637548d54b3e..a8b0a9a4c686 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1286,17 +1286,9 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq); -static inline void skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res) +static inline int skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res) { - struct gnet_stats_queue *stats = res->qstats; - int ret; - - if (res->ingress) - ret = netif_receive_skb(skb); - else - ret = dev_queue_xmit(skb); - if (ret && stats) - qstats_overlimit_inc(res->qstats); + return res->ingress ? netif_receive_skb(skb) : dev_queue_xmit(skb); } #endif -- cgit v1.2.3 From 5e174d5e73dfbfb2c4bc4804f58f2f2aa34c9281 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:04 +0200 Subject: net: sched: modify stats helper functions to support regular stats Modify stats update helper functions introduced in previous patches in this series to fallback to regular tc_action->tcfa_{b|q}stats if cpu stats are not allocated for the action argument. If regular non-percpu allocated counters are in use, then obtain action tcfa_lock while modifying them. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/act_api.h | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8d6861ce205b..a56477051dae 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -190,23 +190,35 @@ int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); static inline void tcf_action_update_bstats(struct tc_action *a, struct sk_buff *skb) { - bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); -} - -static inline struct gnet_stats_queue * -tcf_action_get_qstats(struct tc_action *a) -{ - return this_cpu_ptr(a->cpu_qstats); + if (likely(a->cpu_bstats)) { + bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); + return; + } + spin_lock(&a->tcfa_lock); + bstats_update(&a->tcfa_bstats, skb); + spin_unlock(&a->tcfa_lock); } static inline void tcf_action_inc_drop_qstats(struct tc_action *a) { - qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); + if (likely(a->cpu_qstats)) { + qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); + return; + } + spin_lock(&a->tcfa_lock); + qstats_drop_inc(&a->tcfa_qstats); + spin_unlock(&a->tcfa_lock); } static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) { - qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); + if (likely(a->cpu_qstats)) { + qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); + return; + } + spin_lock(&a->tcfa_lock); + qstats_overlimit_inc(&a->tcfa_qstats); + spin_unlock(&a->tcfa_lock); } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, -- cgit v1.2.3 From abbb0d33632ce931ca9c814813ee131351f6b92f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:05 +0200 Subject: net: sched: extend TCA_ACT space with TCA_ACT_FLAGS Extend TCA_ACT space with nla_bitfield32 flags. Add TCA_ACT_FLAGS_NO_PERCPU_STATS as the only allowed flag. Parse the flags in tcf_action_init_1() and pass resulting value as additional argument to a_o->init(). 
Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- include/uapi/linux/pkt_cls.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index a56477051dae..85e95c44c7f9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -94,7 +94,7 @@ struct tc_action_ops { int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack); + u32 flags, struct netlink_ext_ack *extack); int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *, diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index a6aa466fac9e..c6ad22f76ede 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -16,9 +16,14 @@ enum { TCA_ACT_STATS, TCA_ACT_PAD, TCA_ACT_COOKIE, + TCA_ACT_FLAGS, __TCA_ACT_MAX }; +#define TCA_ACT_FLAGS_NO_PERCPU_STATS 1 /* Don't use percpu allocator for + * actions stats. + */ + #define TCA_ACT_MAX __TCA_ACT_MAX #define TCA_OLD_COMPAT (TCA_ACT_MAX+1) #define TCA_ACT_MAX_PRIO 32 -- cgit v1.2.3 From e38226786022d2d8e5876ab7bc37e82b0eb57e65 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:06 +0200 Subject: net: sched: update action implementations to support flags Extend struct tc_action with new "tcfa_flags" field. Set the field in tcf_idr_create() function and provide new helper tcf_idr_create_from_flags() that derives 'cpustats' boolean from flags value. Update individual hardware-offloaded actions init() to pass their "flags" argument to new helper in order to skip percpu stats allocation when user requested it through flags. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- include/net/act_api.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 85e95c44c7f9..0495bdc034d2 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -41,6 +41,7 @@ struct tc_action { struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie __rcu *act_cookie; struct tcf_chain __rcu *goto_chain; + u32 tcfa_flags; }; #define tcf_index common.tcfa_index #define tcf_refcnt common.tcfa_refcnt @@ -154,7 +155,11 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index); int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, - int bind, bool cpustats); + int bind, bool cpustats, u32 flags); +int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, + struct nlattr *est, struct tc_action **a, + const struct tc_action_ops *ops, int bind, + u32 flags); void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); -- cgit v1.2.3 From f1b9509c2fb0ef4db8d22dac9aef8e856a5d81f6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Oct 2019 15:32:11 -0700 Subject: bpf: Replace prog_raw_tp+btf_id with prog_tracing The bpf program type raw_tp together with 'expected_attach_type' was the most appropriate api to indicate BTF-enabled raw_tp programs. But during development it became apparent that 'expected_attach_type' cannot be used and new 'attach_btf_id' field had to be introduced. Which means that the information is duplicated in two fields where one of them is ignored. Clean it up by introducing new program type where both 'expected_attach_type' and 'attach_btf_id' fields have specific meaning. 
In the future 'expected_attach_type' will be extended with other attach points that have similar semantics to raw_tp. This patch is replacing BTF-enabled BPF_PROG_TYPE_RAW_TRACEPOINT with prog_type = BPF_PROG_TYPE_TRACING expected_attach_type = BPF_TRACE_RAW_TP attach_btf_id = btf_id of raw tracepoint inside the kernel Future patches will add expected_attach_type = BPF_TRACE_FENTRY or BPF_TRACE_FEXIT where programs have the same input context and the same helpers, but different attach points. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191030223212.953010-2-ast@kernel.org --- include/linux/bpf.h | 5 +++++ include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 2 ++ 3 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 171be30fe0ae..80158cff44bd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -373,6 +373,11 @@ enum bpf_cgroup_storage_type { #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX +/* The longest tracepoint has 12 args. 
+ * See include/trace/bpf_probe.h + */ +#define MAX_BPF_FUNC_ARGS 12 + struct bpf_prog_stats { u64 cnt; u64 nsecs; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 36a9c2325176..de14872b01ba 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -26,6 +26,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4af8b0819a32..a6bf19dabaab 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -173,6 +173,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, BPF_PROG_TYPE_CGROUP_SOCKOPT, + BPF_PROG_TYPE_TRACING, }; enum bpf_attach_type { @@ -199,6 +200,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP6_RECVMSG, BPF_CGROUP_GETSOCKOPT, BPF_CGROUP_SETSOCKOPT, + BPF_TRACE_RAW_TP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 246880958ac93989c97c73ae1e60b78b4c4c88c5 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 31 Oct 2019 15:38:50 +0530 Subject: firmware: broadcom: add OP-TEE based BNXT f/w manager This driver registers on TEE bus to interact with OP-TEE based BNXT firmware management modules Cc: Jakub Kicinski Reported-by: kbuild test robot Signed-off-by: Vikas Gupta Signed-off-by: Sheetal Tigadoli Signed-off-by: David S. 
Miller --- include/linux/firmware/broadcom/tee_bnxt_fw.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 include/linux/firmware/broadcom/tee_bnxt_fw.h (limited to 'include') diff --git a/include/linux/firmware/broadcom/tee_bnxt_fw.h b/include/linux/firmware/broadcom/tee_bnxt_fw.h new file mode 100644 index 000000000000..f24c82d6ef73 --- /dev/null +++ b/include/linux/firmware/broadcom/tee_bnxt_fw.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright 2019 Broadcom. + */ + +#ifndef _BROADCOM_TEE_BNXT_FW_H +#define _BROADCOM_TEE_BNXT_FW_H + +#include + +int tee_bnxt_fw_load(void); +int tee_bnxt_copy_coredump(void *buf, u32 offset, u32 size); + +#endif /* _BROADCOM_TEE_BNXT_FW_H */ -- cgit v1.2.3 From a2d00f3db73dc4f6f6afcc95c1db809ea9019306 Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Thu, 31 Oct 2019 16:37:58 +0200 Subject: soc: fsl: qbman: allow registering a device link for the portal user Introduce the API required to make sure that the devices that use the QMan portal are unbound when the portal is unbound. Signed-off-by: Madalin Bucur Signed-off-by: David S. 
Miller --- include/soc/fsl/qman.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h index aa31c05a103a..c499c5cfa7c9 100644 --- a/include/soc/fsl/qman.h +++ b/include/soc/fsl/qman.h @@ -32,6 +32,7 @@ #define __FSL_QMAN_H #include +#include /* Hardware constants */ #define QM_CHANNEL_SWPORTAL0 0 @@ -914,6 +915,23 @@ u16 qman_affine_channel(int cpu); */ struct qman_portal *qman_get_affine_portal(int cpu); +/** + * qman_start_using_portal - register a device link for the portal user + * @p: the portal that will be in use + * @dev: the device that will use the portal + * + * Makes sure that the devices that use the portal are unbound when the + * portal is unbound + */ +int qman_start_using_portal(struct qman_portal *p, struct device *dev); + +/** + * qman_stop_using_portal - deregister a device link for the portal user + * @p: the portal that will no longer be in use + * @dev: the device that uses the portal + */ +void qman_stop_using_portal(struct qman_portal *p, struct device *dev); + /** * qman_p_poll_dqrr - process DQRR (fast-path) entries * @limit: the maximum number of DQRR entries to process -- cgit v1.2.3 From e06eea555b878f2c95b498aa1c485250ad30c960 Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Thu, 31 Oct 2019 16:37:59 +0200 Subject: dpaa_eth: register a device link for the qman portal used Before this change, unbinding the QMan portals did not trigger a corresponding unbinding of the dpaa_eth making use of it; the first QMan portal related operation issued afterwards crashed the kernel. The device link ensures the dpaa_eth dependency upon the qman portal used is honoured at the QMan portal removal. Signed-off-by: Madalin Bucur Signed-off-by: David S. 
Miller --- include/soc/fsl/qman.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h index c499c5cfa7c9..cfe00e08e85b 100644 --- a/include/soc/fsl/qman.h +++ b/include/soc/fsl/qman.h @@ -925,13 +925,6 @@ struct qman_portal *qman_get_affine_portal(int cpu); */ int qman_start_using_portal(struct qman_portal *p, struct device *dev); -/** - * qman_stop_using_portal - deregister a device link for the portal user - * @p: the portal that will no longer be in use - * @dev: the device that uses the portal - */ -void qman_stop_using_portal(struct qman_portal *p, struct device *dev); - /** * qman_p_poll_dqrr - process DQRR (fast-path) entries * @limit: the maximum number of DQRR entries to process -- cgit v1.2.3 From 1ac210d128ef6e92698dd3aa4e2e03e831bc9906 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Thu, 31 Oct 2019 01:18:29 +0200 Subject: bus: fsl-mc: add the fsl_mc_get_endpoint function Using the newly added fsl_mc_get_endpoint function a fsl-mc driver can find its associated endpoint (another object at the other link of a MC firmware link). The API will be used in the following patch in order to discover the connected DPMAC object of a DPNI. Also, the fsl_mc_device_lookup function is made available to the entire fsl-mc bus driver and not just for the dprc driver. Signed-off-by: Ioana Ciornei Signed-off-by: David S. 
Miller --- include/linux/fsl/mc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 975553a9f75d..54d9436600c7 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -403,6 +403,8 @@ int __must_check fsl_mc_allocate_irqs(struct fsl_mc_device *mc_dev); void fsl_mc_free_irqs(struct fsl_mc_device *mc_dev); +struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev); + extern struct bus_type fsl_mc_bus_type; extern struct device_type fsl_mc_bus_dprc_type; -- cgit v1.2.3 From c5f51765a1f60b701840544faf3ca63204b8dc3c Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 30 Oct 2019 22:09:13 -0400 Subject: net: dsa: list DSA links in the fabric Implement a new list of DSA links in the switch fabric itself, to provide an alternative to the ds->rtable static arrays. At the same time, provide a new dsa_routing_port() helper to abstract the usage of ds->rtable in drivers. If there's no port to reach a given device, return the first invalid port, ds->num_ports. This avoids potential signedness errors or the need to define special values. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9aba326abb64..3d7366d634d8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -123,6 +123,9 @@ struct dsa_switch_tree { /* List of switch ports */ struct list_head ports; + /* List of DSA links composing the routing table */ + struct list_head rtable; + /* * Data for the individual switch chips. 
*/ @@ -214,6 +217,17 @@ struct dsa_port { bool setup; }; +/* TODO: ideally DSA ports would have a single dp->link_dp member, + * and no dst->rtable nor this struct dsa_link would be needed, + * but this would require some more complex tree walking, + * so keep it stupid at the moment and list them all. + */ +struct dsa_link { + struct dsa_port *dp; + struct dsa_port *link_dp; + struct list_head list; +}; + struct dsa_switch { bool setup; @@ -324,6 +338,19 @@ static inline u32 dsa_user_ports(struct dsa_switch *ds) return mask; } +/* Return the local port used to reach an arbitrary switch device */ +static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device) +{ + struct dsa_switch_tree *dst = ds->dst; + struct dsa_link *dl; + + list_for_each_entry(dl, &dst->rtable, list) + if (dl->dp->ds == ds && dl->link_dp->ds->index == device) + return dl->dp->index; + + return ds->num_ports; +} + /* Return the local port used to reach an arbitrary switch port */ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, int port) @@ -331,7 +358,7 @@ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, if (device == ds->index) return port; else - return ds->rtable[device]; + return dsa_routing_port(ds, device); } /* Return the local port used to reach the dedicated CPU port */ -- cgit v1.2.3 From 96252b8e05326df072cd321159878aa4725c5bd4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 30 Oct 2019 22:09:14 -0400 Subject: net: dsa: remove ds->rtable Drivers do not use the ds->rtable static arrays anymore, get rid of it. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- include/net/dsa.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 3d7366d634d8..b46222adb5c2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -258,13 +258,6 @@ struct dsa_switch { */ const struct dsa_switch_ops *ops; - /* - * An array of which element [a] indicates which port on this - * switch should be used to send packets to that are destined - * for switch a. Can be NULL if there is only one switch chip. - */ - s8 rtable[DSA_MAX_SWITCHES]; - /* * Slave mii_bus and devices for the individual ports. */ -- cgit v1.2.3 From 9c8ad1ab66b577526a4c89e4a222e0fac431a2d6 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 30 Oct 2019 22:09:16 -0400 Subject: net: dsa: remove the dst->ds array Now that the DSA ports are listed in the switch fabric, there is no need to store the dsa_switch structures from the drivers in the fabric anymore. So get rid of the dst->ds static array. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index b46222adb5c2..e4c697b95c70 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -125,11 +125,6 @@ struct dsa_switch_tree { /* List of DSA links composing the routing table */ struct list_head rtable; - - /* - * Data for the individual switch chips. - */ - struct dsa_switch *ds[DSA_MAX_SWITCHES]; }; /* TC matchall action types, only mirroring for now */ -- cgit v1.2.3 From be0c5677970d4f21dc701136a178437aad9983b2 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Nov 2019 14:46:37 +0200 Subject: net: bridge: fdb: br_fdb_update can take flags directly If we modify br_fdb_update() to take flags directly we can get rid of one test and one atomic bitop in the learning path. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/trace/events/bridge.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h index 8ea966448b58..6b200059c2c5 100644 --- a/include/trace/events/bridge.h +++ b/include/trace/events/bridge.h @@ -95,16 +95,16 @@ TRACE_EVENT(fdb_delete, TRACE_EVENT(br_fdb_update, TP_PROTO(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user), + const unsigned char *addr, u16 vid, unsigned long flags), - TP_ARGS(br, source, addr, vid, added_by_user), + TP_ARGS(br, source, addr, vid, flags), TP_STRUCT__entry( __string(br_dev, br->dev->name) __string(dev, source->dev->name) __array(unsigned char, addr, ETH_ALEN) __field(u16, vid) - __field(bool, added_by_user) + __field(unsigned long, flags) ), TP_fast_assign( @@ -112,14 +112,14 @@ TRACE_EVENT(br_fdb_update, __assign_str(dev, source->dev->name); memcpy(__entry->addr, addr, ETH_ALEN); __entry->vid = vid; - __entry->added_by_user = added_by_user; + __entry->flags = flags; ), - TP_printk("br_dev %s source %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u added_by_user %d", + TP_printk("br_dev %s source %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u flags 0x%lx", __get_str(br_dev), __get_str(dev), __entry->addr[0], __entry->addr[1], __entry->addr[2], __entry->addr[3], __entry->addr[4], __entry->addr[5], __entry->vid, - __entry->added_by_user) + __entry->flags) ); -- cgit v1.2.3 From e53a9d26cf80565cfb7172fc52a0dfac73613a0f Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 28 Oct 2019 23:35:30 +0000 Subject: IB/mlx5: Introduce and use mlx5_core_is_vf() Instead of deciding a given device is virtual function or not based on a device is PF or not, use already defined MLX5_COREDEV_VF by introducing an helper API mlx5_core_is_vf(). This enables to clearly identify PF, VF and non virtual functions. 
Signed-off-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3e80f03a387f..7b4801e96feb 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1121,6 +1121,11 @@ static inline bool mlx5_core_is_pf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_PF; } +static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) +{ + return dev->coredev_type == MLX5_COREDEV_VF; +} + static inline bool mlx5_core_is_ecpf(struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; -- cgit v1.2.3 From d817991cc7486ab83f6c7188b0bc80eebee872f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 1 Nov 2019 12:03:46 +0100 Subject: xsk: Restructure/inline XSKMAP lookup/redirect/flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit the XSKMAP entry lookup function used by the XDP redirect code is moved from the xskmap.c file to the xdp_sock.h header, so the lookup can be inlined from, e.g., the bpf_xdp_redirect_map() function. Further the __xsk_map_redirect() and __xsk_map_flush() is moved to the xsk.c, which lets the compiler inline the xsk_rcv() and xsk_flush() functions. Finally, all the XDP socket functions were moved from linux/bpf.h to net/xdp_sock.h, where most of the XDP sockets functions are anyway. This yields a ~2% performance boost for the xdpsock "rx_drop" scenario. 
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191101110346.15004-4-bjorn.topel@gmail.com --- include/linux/bpf.h | 25 ------------------------- include/net/xdp_sock.h | 51 ++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80158cff44bd..7c7f518811a6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1009,31 +1009,6 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr, } #endif -#if defined(CONFIG_XDP_SOCKETS) -struct xdp_sock; -struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key); -int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs); -void __xsk_map_flush(struct bpf_map *map); -#else -struct xdp_sock; -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} - -static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs) -{ - return -EOPNOTSUPP; -} - -static inline void __xsk_map_flush(struct bpf_map *map) -{ -} -#endif - #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) void bpf_sk_reuseport_detach(struct sock *sk); int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index c9398ce7960f..e3780e4b74e1 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -69,7 +69,14 @@ struct xdp_umem { /* Nodes are linked in the struct xdp_sock map_list field, and used to * track which maps a certain socket reside in. 
*/ -struct xsk_map; + +struct xsk_map { + struct bpf_map map; + struct list_head __percpu *flush_list; + spinlock_t lock; /* Synchronize map updates */ + struct xdp_sock *xsk_map[]; +}; + struct xsk_map_node { struct list_head node; struct xsk_map *map; @@ -109,8 +116,6 @@ struct xdp_sock { struct xdp_buff; #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); @@ -134,6 +139,22 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); int xsk_map_inc(struct xsk_map *map); void xsk_map_put(struct xsk_map *map); +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs); +void __xsk_map_flush(struct bpf_map *map); + +static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, + u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs; + + if (key >= map->max_entries) + return NULL; + + xs = READ_ONCE(m->xsk_map[key]); + return xs; +} static inline u64 xsk_umem_extract_addr(u64 addr) { @@ -224,15 +245,6 @@ static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return -ENOTSUPP; } -static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) -{ - return -ENOTSUPP; -} - -static inline void xsk_flush(struct xdp_sock *xs) -{ -} - static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return false; @@ -357,6 +369,21 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, return 0; } +static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + return -EOPNOTSUPP; +} + +static inline void __xsk_map_flush(struct bpf_map *map) +{ +} + +static inline struct xdp_sock 
*__xsk_map_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ -- cgit v1.2.3 From 1d1585ca0f48fe7ed95c3571f3e4a82b2b5045dc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:56 +0100 Subject: uaccess: Add non-pagefault user-space write function Commit 3d7081822f7f ("uaccess: Add non-pagefault user-space read functions") missed to add probe write function, therefore factor out a probe_write_common() helper with most logic of probe_kernel_write() except setting KERNEL_DS, and add a new probe_user_write() helper so it can be used from BPF side. Again, on some archs, the user address space and kernel address space can co-exist and be overlapping, so in such case, setting KERNEL_DS would mean that the given address is treated as being in kernel address space. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: Masami Hiramatsu Link: https://lore.kernel.org/bpf/9df2542e68141bfa3addde631441ee45503856a8.1572649915.git.daniel@iogearbox.net --- include/linux/uaccess.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index d4ee6e942562..38555435a64a 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -337,6 +337,18 @@ extern long __probe_user_read(void *dst, const void __user *src, size_t size); extern long notrace probe_kernel_write(void *dst, const void *src, size_t size); extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size); +/* + * probe_user_write(): safely attempt to write to a location in user space + * @dst: address to write to + * @src: pointer to the data that shall be written + * @size: size of the data chunk + * + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. 
+ */ +extern long notrace probe_user_write(void __user *dst, const void *src, size_t size); +extern long notrace __probe_user_write(void __user *dst, const void *src, size_t size); + extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, long count); -- cgit v1.2.3 From 75a1a607bb7e6d918be3aca11ec2214a275392f4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:57 +0100 Subject: uaccess: Add strict non-pagefault kernel-space read function Add two new probe_kernel_read_strict() and strncpy_from_unsafe_strict() helpers which by default alias to the __probe_kernel_read() and the __strncpy_from_unsafe(), respectively, but can be overridden by archs which have non-overlapping address ranges for kernel space and user space in order to bail out with -EFAULT when attempting to probe user memory including non-canonical user access addresses [0]: 4-level page tables: user-space mem: 0x0000000000000000 - 0x00007fffffffffff non-canonical: 0x0000800000000000 - 0xffff7fffffffffff 5-level page tables: user-space mem: 0x0000000000000000 - 0x00ffffffffffffff non-canonical: 0x0100000000000000 - 0xfeffffffffffffff The idea is that these helpers are complementary to the probe_user_read() and strncpy_from_unsafe_user() which probe user-only memory. Both added helpers here do the same, but for kernel-only addresses. Both set of helpers are going to be used for BPF tracing. They also explicitly avoid throwing the splat for non-canonical user addresses from 00c42373d397 ("x86-64: add warning for non-canonical user access address dereferences"). For compat, the current probe_kernel_read() and strncpy_from_unsafe() are left as-is. 
[0] Documentation/x86/x86_64/mm.txt Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: x86@kernel.org Link: https://lore.kernel.org/bpf/eefeefd769aa5a013531f491a71f0936779e916b.1572649915.git.daniel@iogearbox.net --- include/linux/uaccess.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 38555435a64a..67f016010aad 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -311,6 +311,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src, * happens, handle that and return -EFAULT. */ extern long probe_kernel_read(void *dst, const void *src, size_t size); +extern long probe_kernel_read_strict(void *dst, const void *src, size_t size); extern long __probe_kernel_read(void *dst, const void *src, size_t size); /* @@ -350,6 +351,9 @@ extern long notrace probe_user_write(void __user *dst, const void *src, size_t s extern long notrace __probe_user_write(void __user *dst, const void *src, size_t size); extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); +extern long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, + long count); +extern long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, long count); extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); -- cgit v1.2.3 From 6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:59 +0100 Subject: bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers The current bpf_probe_read() and bpf_probe_read_str() helpers are broken in that they assume they can be used for probing memory access for kernel space addresses /as well as/ user space addresses. 
However, plain use of probe_kernel_read() for both cases will attempt to always access kernel space address space given access is performed under KERNEL_DS and some archs in-fact have overlapping address spaces where a kernel pointer and user pointer would have the /same/ address value and therefore accessing application memory via bpf_probe_read{,_str}() would read garbage values. Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess: Add non-pagefault user-space read functions"). Unfortunately, the only way to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}() helpers are kept as-is to retain their current behavior. The two *_user() variants attempt the access always under USER_DS set, the two *_kernel() variants will -EFAULT when accessing user memory if the underlying architecture has non-overlapping address ranges, also avoiding throwing the kernel warning via 00c42373d397 ("x86-64: add warning for non-canonical user access address dereferences"). Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper") Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 122 +++++++++++++++++++++++++++++++---------------- 1 file changed, 82 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a6bf19dabaab..df6809a76404 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -563,10 +563,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_probe_read(void *dst, u32 size, const void *src) + * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from - * address *src* and store the data in *dst*. + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() + * instead. * Return * 0 on success, or a negative error in case of failure. * @@ -1428,45 +1431,14 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) + * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description - * Copy a NUL terminated string from an unsafe address - * *unsafe_ptr* to *dst*. The *size* should include the - * terminating NUL byte. In case the string length is smaller than - * *size*, the target is not padded with further NUL bytes. If the - * string length is larger than *size*, just *size*-1 bytes are - * copied and the last byte is set to NUL. - * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: - * - * :: - * - * SEC("kprobe/sys_open") - * void bpf_sys_open(struct pt_regs *ctx) - * { - * char buf[PATHLEN]; // PATHLEN is defined to 256 - * int res = bpf_probe_read_str(buf, sizeof(buf), - * ctx->di); - * - * // Consume buf, for example push it to - * // userspace via bpf_perf_event_output(); we - * // can use res (the string length) as event - * // size, after checking its boundaries. - * } - * - * In comparison, using **bpf_probe_read()** helper here instead - * to read the string would require to estimate the length at - * compile time, and would often result in copying more memory - * than necessary. 
+ * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for + * more details. * - * Another useful use case is when parsing individual process - * arguments or individual environment variables navigating - * *current*\ **->mm->arg_start** and *current*\ - * **->mm->env_start**: using this helper and the return value, - * one can quickly iterate at the right offset of the memory area. + * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() + * instead. * Return * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative @@ -2777,6 +2749,72 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, the length of the copied string is returned. 
This + * makes this helper useful in tracing programs for reading + * strings, and more importantly to get its length at runtime. See + * the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user()** helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. + * Return + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2890,7 +2928,11 @@ union bpf_attr { FN(sk_storage_delete), \ FN(send_signal), \ FN(tcp_gen_syncookie), \ - FN(skb_output), + FN(skb_output), \ + FN(probe_read_user), \ + FN(probe_read_kernel), \ + FN(probe_read_user_str), \ + FN(probe_read_kernel_str), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 0c65b2b90d13c1deaee6449304dd367c5d4eb8ae Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 4 Nov 2019 02:40:33 +0100 Subject: net: of_get_phy_mode: Change API to solve int/unit warnings Before this change of_get_phy_mode() returned an enum, phy_interface_t. On error, -ENODEV etc, is returned. If the result of the function is stored in a variable of type phy_interface_t, and the compiler has decided to represent this as an unsigned int, comparison with -ENODEV etc, is a signed vs unsigned comparison. Fix this problem by changing the API. Make the function return an error, or 0 on success, and pass a pointer, of type phy_interface_t, where the phy mode should be stored. v2: Return with *interface set to PHY_INTERFACE_MODE_NA on error. Add error checks to all users of of_get_phy_mode() Fixup a few reverse christmas tree errors Fixup a few slightly malformed reverse christmas trees v3: Fix 0-day reported errors. Reported-by: Dan Carpenter Signed-off-by: Andrew Lunn Signed-off-by: David S.
Miller --- include/linux/of_net.h | 7 +++++-- include/linux/stmmac.h | 3 ++- include/linux/sxgbe_platform.h | 4 +++- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index 6aeaea1775e6..71bbfcf3adcd 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -6,15 +6,18 @@ #ifndef __LINUX_OF_NET_H #define __LINUX_OF_NET_H +#include + #ifdef CONFIG_OF_NET #include struct net_device; -extern int of_get_phy_mode(struct device_node *np); +extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface); extern const void *of_get_mac_address(struct device_node *np); extern struct net_device *of_find_net_device_by_node(struct device_node *np); #else -static inline int of_get_phy_mode(struct device_node *np) +static inline int of_get_phy_mode(struct device_node *np, + phy_interface_t *interface) { return -ENODEV; } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 86f9464c3f5d..d4bcd9387136 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -13,6 +13,7 @@ #define __STMMAC_PLATFORM_DATA #include +#include #define MTL_MAX_RX_QUEUES 8 #define MTL_MAX_TX_QUEUES 8 @@ -132,7 +133,7 @@ struct plat_stmmacenet_data { int bus_id; int phy_addr; int interface; - int phy_interface; + phy_interface_t phy_interface; struct stmmac_mdio_bus_data *mdio_bus_data; struct device_node *phy_node; struct device_node *phylink_node; diff --git a/include/linux/sxgbe_platform.h b/include/linux/sxgbe_platform.h index 267369110584..85ec745767bd 100644 --- a/include/linux/sxgbe_platform.h +++ b/include/linux/sxgbe_platform.h @@ -10,6 +10,8 @@ #ifndef __SXGBE_PLATFORM_H__ #define __SXGBE_PLATFORM_H__ +#include + /* MDC Clock Selection define*/ #define SXGBE_CSR_100_150M 0x0 /* MDC = clk_scr_i/62 */ #define SXGBE_CSR_150_250M 0x1 /* MDC = clk_scr_i/102 */ @@ -38,7 +40,7 @@ struct sxgbe_plat_data { char *phy_bus_name; int bus_id; int phy_addr; - int interface; + 
phy_interface_t interface; struct sxgbe_mdio_bus_data *mdio_bus_data; struct sxgbe_dma_cfg *dma_cfg; int clk_csr; -- cgit v1.2.3 From b6520fce073b619e6f2c0d510bb3481c9386c70b Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Thu, 26 Sep 2019 12:06:45 +0200 Subject: netfilter: ipset: Add wildcard support to net,iface The net,iface equal functions currently compares the full interface names. In several cases, wildcard (or prefix) matching is useful. For example, when converting a large iptables rule-set to make use of ipset, I was able to significantly reduce the number of set elements by making use of wildcard matching. Wildcard matching is enabled by adding "wildcard" when adding an element to a set. Internally, this causes the IPSET_FLAG_IFACE_WILDCARD-flag to be set. When this flag is set, only the initial part of the interface name is used for comparison. Wildcard matching is done per element and not per set, as there are many cases where mixing wildcard and non-wildcard elements are useful. This means that is up to the user to handle (avoid) overlapping interface names. 
Signed-off-by: Kristian Evensen Signed-off-by: Jozsef Kadlecsik --- include/uapi/linux/netfilter/ipset/ip_set.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index eea166c52c36..11a72a938eb1 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -205,6 +205,8 @@ enum ipset_cadt_flags { IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD), IPSET_FLAG_BIT_WITH_SKBINFO = 6, IPSET_FLAG_WITH_SKBINFO = (1 << IPSET_FLAG_BIT_WITH_SKBINFO), + IPSET_FLAG_BIT_IFACE_WILDCARD = 7, + IPSET_FLAG_IFACE_WILDCARD = (1 << IPSET_FLAG_BIT_IFACE_WILDCARD), IPSET_FLAG_CADT_MAX = 15, }; -- cgit v1.2.3 From 15122464d525f684a61806d28597050cdcef0f32 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Sat, 2 Nov 2019 01:12:03 +0100 Subject: icmp: add helpers to recognize ICMP error packets Add two helper functions, one for IPv4 and one for IPv6, to recognize the ICMP packets which are error responses. These packets are special because they have as payload the original header of the packet which generated them (RFC 792 says at least 8 bytes, but Linux actually includes much more than that). Signed-off-by: Matteo Croce Signed-off-by: David S.
Miller --- include/linux/icmp.h | 15 +++++++++++++++ include/linux/icmpv6.h | 14 ++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/icmp.h b/include/linux/icmp.h index 2d8aaf7d4b9e..81ca84ce3119 100644 --- a/include/linux/icmp.h +++ b/include/linux/icmp.h @@ -20,4 +20,19 @@ static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) { return (struct icmphdr *)skb_transport_header(skb); } + +static inline bool icmp_is_err(int type) +{ + switch (type) { + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_REDIRECT: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + return true; + } + + return false; +} + #endif /* _LINUX_ICMP_H */ diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index a8f888976137..ef1cbb5f454f 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -46,4 +46,18 @@ extern void icmpv6_flow_init(struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, int oif); + +static inline bool icmpv6_is_err(int type) +{ + switch (type) { + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + case ICMPV6_PARAMPROB: + return true; + } + + return false; +} + #endif -- cgit v1.2.3 From 5cd73fbd78794d9c9c4e7a61dc8fa83489b43d03 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 5 Nov 2019 01:12:57 +0100 Subject: net: dsa: Add support for devlink resources Add wrappers around the devlink resource API, so that DSA drivers can register and unregister devlink resources. Signed-off-by: Andrew Lunn Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/dsa.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index e4c697b95c70..9507611a41f0 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -586,6 +586,22 @@ int dsa_devlink_params_register(struct dsa_switch *ds, void dsa_devlink_params_unregister(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); +int dsa_devlink_resource_register(struct dsa_switch *ds, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params); + +void dsa_devlink_resources_unregister(struct dsa_switch *ds); + +void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv); +void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, + u64 resource_id); + struct dsa_devlink_priv { struct dsa_switch *ds; }; -- cgit v1.2.3 From 4d390c287b2f3fbd0bb64c52c1a9418f790986e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:13 -0800 Subject: net_sched: do not export gnet_stats_basic_packed to uapi gnet_stats_basic_packed was really meant to be private kernel structure. If this proves to be a problem, we will have to rename the in-kernel version. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/gen_stats.h | 6 ++++++ include/uapi/linux/gen_stats.h | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index ca23860adbb9..5f3889e7ec1b 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -7,6 +7,12 @@ #include #include +/* Note: this used to be in include/uapi/linux/gen_stats.h */ +struct gnet_stats_basic_packed { + __u64 bytes; + __u32 packets; +} __attribute__ ((packed)); + struct gnet_stats_basic_cpu { struct gnet_stats_basic_packed bstats; struct u64_stats_sync syncp; diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 065408e16a80..4eaacdf452e3 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -26,10 +26,6 @@ struct gnet_stats_basic { __u64 bytes; __u32 packets; }; -struct gnet_stats_basic_packed { - __u64 bytes; - __u32 packets; -} __attribute__ ((packed)); /** * struct gnet_stats_rate_est - rate estimator -- cgit v1.2.3 From d0083d98f685b9f4fe810570f93cef0b0bb6b354 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:14 -0800 Subject: net_sched: extend packet counter to 64bit After this change, qdisc packet counter is no longer a 32bit quantity. We still export 32bit values to user. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/gen_stats.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 5f3889e7ec1b..1424e02cef90 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -10,8 +10,8 @@ /* Note: this used to be in include/uapi/linux/gen_stats.h */ struct gnet_stats_basic_packed { __u64 bytes; - __u32 packets; -} __attribute__ ((packed)); + __u64 packets; +}; struct gnet_stats_basic_cpu { struct gnet_stats_basic_packed bstats; -- cgit v1.2.3 From b33e699fe43aa63f29113311f69357e119ef5276 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:15 -0800 Subject: net_sched: add TCA_STATS_PKT64 attribute Now the kernel uses 64bit packet counters in scheduler layer, we want to export these counters to user space. Instead risking breaking user space by adding fields to struct gnet_stats_basic, add a new TCA_STATS_PKT64. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/gen_stats.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 4eaacdf452e3..852f234f1fd6 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -13,6 +13,7 @@ enum { TCA_STATS_RATE_EST64, TCA_STATS_PAD, TCA_STATS_BASIC_HW, + TCA_STATS_PKT64, __TCA_STATS_MAX, }; #define TCA_STATS_MAX (__TCA_STATS_MAX - 1) -- cgit v1.2.3 From 25c7a6d1f90e208ec27ca854b1381ed39842ec57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2019 14:11:51 -0800 Subject: net: avoid potential false sharing in neighbor related code There are common instances of the following construct : if (n->confirmed != now) n->confirmed = now; A C compiler could legally remove the conditional. Use READ_ONCE()/WRITE_ONCE() to avoid this problem. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/arp.h | 4 ++-- include/net/ndisc.h | 8 ++++---- include/net/sock.h | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/arp.h b/include/net/arp.h index c8f580a0e6b1..4950191f6b2b 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -57,8 +57,8 @@ static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) unsigned long now = jiffies; /* avoid dirtying neighbour */ - if (n->confirmed != now) - n->confirmed = now; + if (READ_ONCE(n->confirmed) != now) + WRITE_ONCE(n->confirmed, now); } rcu_read_unlock_bh(); } diff --git a/include/net/ndisc.h b/include/net/ndisc.h index b2f715ca0567..b5ebeb3b0de0 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -414,8 +414,8 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev, unsigned long now = jiffies; /* avoid dirtying neighbour */ - if (n->confirmed != now) - n->confirmed = now; + if (READ_ONCE(n->confirmed) != now) + WRITE_ONCE(n->confirmed, now); } rcu_read_unlock_bh(); } @@ -431,8 +431,8 @@ static inline void __ipv6_confirm_neigh_stub(struct net_device *dev, unsigned long now = jiffies; /* avoid dirtying neighbour */ - if (n->confirmed != now) - n->confirmed = now; + if (READ_ONCE(n->confirmed) != now) + WRITE_ONCE(n->confirmed, now); } rcu_read_unlock_bh(); } diff --git a/include/net/sock.h b/include/net/sock.h index ac6042d0af32..f2f853439b65 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1939,8 +1939,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); static inline void sk_dst_confirm(struct sock *sk) { - if (!sk->sk_dst_pending_confirm) - sk->sk_dst_pending_confirm = 1; + if (!READ_ONCE(sk->sk_dst_pending_confirm)) + WRITE_ONCE(sk->sk_dst_pending_confirm, 1); } static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) @@ -1950,10 +1950,10 @@ static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) unsigned long now = 
jiffies; /* avoid dirtying neighbour */ - if (n->confirmed != now) - n->confirmed = now; - if (sk && sk->sk_dst_pending_confirm) - sk->sk_dst_pending_confirm = 0; + if (READ_ONCE(n->confirmed) != now) + WRITE_ONCE(n->confirmed, now); + if (sk && READ_ONCE(sk->sk_dst_pending_confirm)) + WRITE_ONCE(sk->sk_dst_pending_confirm, 0); } } -- cgit v1.2.3 From 288efe8606b62d0753ba6722b36ef241877251fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2019 14:11:53 -0800 Subject: net: annotate lockless accesses to sk->sk_ack_backlog sk->sk_ack_backlog can be read without any lock being held. We need to use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing and/or potential KCSAN warnings. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index f2f853439b65..a126784aa7d9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -859,17 +859,17 @@ static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) static inline void sk_acceptq_removed(struct sock *sk) { - sk->sk_ack_backlog--; + WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1); } static inline void sk_acceptq_added(struct sock *sk) { - sk->sk_ack_backlog++; + WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); } static inline bool sk_acceptq_is_full(const struct sock *sk) { - return sk->sk_ack_backlog > sk->sk_max_ack_backlog; + return READ_ONCE(sk->sk_ack_backlog) > sk->sk_max_ack_backlog; } /* -- cgit v1.2.3 From 099ecf59f05b5f30f42ebac0ab8cb94f9b18c90c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2019 14:11:54 -0800 Subject: net: annotate lockless accesses to sk->sk_max_ack_backlog sk->sk_max_ack_backlog can be read without any lock being held at least in TCP/DCCP cases. We need to use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing and/or potential KCSAN warnings. 
Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index a126784aa7d9..d4d3ef5ba049 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -869,7 +869,7 @@ static inline void sk_acceptq_added(struct sock *sk) static inline bool sk_acceptq_is_full(const struct sock *sk) { - return READ_ONCE(sk->sk_ack_backlog) > sk->sk_max_ack_backlog; + return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); } /* -- cgit v1.2.3 From 4ece477870774698e6e73d5821a3dd1605ca123b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:05 +0800 Subject: lwtunnel: add options setting and dumping for geneve To add options setting and dumping, .build_state(), .fill_encap() and .get_encap_size() in ip_tun_lwt_ops needs to be extended: ip_tun_build_state(): ip_tun_parse_opts(): ip_tun_parse_opts_geneve() ip_tun_fill_encap_info(): ip_tun_fill_encap_opts(): ip_tun_fill_encap_opts_geneve() ip_tun_encap_nlsize() ip_tun_opts_nlsize(): if (tun_flags & TUNNEL_GENEVE_OPT) ip_tun_parse_opts(), ip_tun_fill_encap_opts() and ip_tun_opts_nlsize() processes LWTUNNEL_IP_OPTS. ip_tun_parse_opts_geneve(), ip_tun_fill_encap_opts_geneve() and if (tun_flags & TUNNEL_GENEVE_OPT) processes LWTUNNEL_IP_OPTS_GENEVE. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/uapi/linux/lwtunnel.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index de696ca12f2c..b595ab219036 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -27,6 +27,7 @@ enum lwtunnel_ip_t { LWTUNNEL_IP_TOS, LWTUNNEL_IP_FLAGS, LWTUNNEL_IP_PAD, + LWTUNNEL_IP_OPTS, __LWTUNNEL_IP_MAX, }; @@ -41,11 +42,30 @@ enum lwtunnel_ip6_t { LWTUNNEL_IP6_TC, LWTUNNEL_IP6_FLAGS, LWTUNNEL_IP6_PAD, + LWTUNNEL_IP6_OPTS, __LWTUNNEL_IP6_MAX, }; #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) +enum { + LWTUNNEL_IP_OPTS_UNSPEC, + LWTUNNEL_IP_OPTS_GENEVE, + __LWTUNNEL_IP_OPTS_MAX, +}; + +#define LWTUNNEL_IP_OPTS_MAX (__LWTUNNEL_IP_OPTS_MAX - 1) + +enum { + LWTUNNEL_IP_OPT_GENEVE_UNSPEC, + LWTUNNEL_IP_OPT_GENEVE_CLASS, + LWTUNNEL_IP_OPT_GENEVE_TYPE, + LWTUNNEL_IP_OPT_GENEVE_DATA, + __LWTUNNEL_IP_OPT_GENEVE_MAX, +}; + +#define LWTUNNEL_IP_OPT_GENEVE_MAX (__LWTUNNEL_IP_OPT_GENEVE_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From edf31cbb1502481da181a09148adb33e12599185 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:06 +0800 Subject: lwtunnel: add options setting and dumping for vxlan Based on the code framework built on the last patch, to support setting and dumping for vxlan, we only need to add ip_tun_parse_opts_vxlan() for .build_state and ip_tun_fill_encap_opts_vxlan() for .fill_encap and if (tun_flags & TUNNEL_VXLAN_OPT) for .get_encap_size. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/uapi/linux/lwtunnel.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index b595ab219036..638b7b108453 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -51,6 +51,7 @@ enum lwtunnel_ip6_t { enum { LWTUNNEL_IP_OPTS_UNSPEC, LWTUNNEL_IP_OPTS_GENEVE, + LWTUNNEL_IP_OPTS_VXLAN, __LWTUNNEL_IP_OPTS_MAX, }; @@ -66,6 +67,14 @@ enum { #define LWTUNNEL_IP_OPT_GENEVE_MAX (__LWTUNNEL_IP_OPT_GENEVE_MAX - 1) +enum { + LWTUNNEL_IP_OPT_VXLAN_UNSPEC, + LWTUNNEL_IP_OPT_VXLAN_GBP, + __LWTUNNEL_IP_OPT_VXLAN_MAX, +}; + +#define LWTUNNEL_IP_OPT_VXLAN_MAX (__LWTUNNEL_IP_OPT_VXLAN_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From b0a21810bd5e1f92e3379899cc8ca9fe144ee8b3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:07 +0800 Subject: lwtunnel: add options setting and dumping for erspan Based on the code framework built on the last patch, to support setting and dumping for erspan, we only need to add ip_tun_parse_opts_erspan() for .build_state and ip_tun_fill_encap_opts_erspan() for .fill_encap and if (tun_flags & TUNNEL_ERSPAN_OPT) for .get_encap_size. Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- include/uapi/linux/lwtunnel.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 638b7b108453..f6035f737193 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -52,6 +52,7 @@ enum { LWTUNNEL_IP_OPTS_UNSPEC, LWTUNNEL_IP_OPTS_GENEVE, LWTUNNEL_IP_OPTS_VXLAN, + LWTUNNEL_IP_OPTS_ERSPAN, __LWTUNNEL_IP_OPTS_MAX, }; @@ -75,6 +76,17 @@ enum { #define LWTUNNEL_IP_OPT_VXLAN_MAX (__LWTUNNEL_IP_OPT_VXLAN_MAX - 1) +enum { + LWTUNNEL_IP_OPT_ERSPAN_UNSPEC, + LWTUNNEL_IP_OPT_ERSPAN_VER, + LWTUNNEL_IP_OPT_ERSPAN_INDEX, + LWTUNNEL_IP_OPT_ERSPAN_DIR, + LWTUNNEL_IP_OPT_ERSPAN_HWID, + __LWTUNNEL_IP_OPT_ERSPAN_MAX, +}; + +#define LWTUNNEL_IP_OPT_ERSPAN_MAX (__LWTUNNEL_IP_OPT_ERSPAN_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From 9ed498c6280a2f2b51d02df96df53037272ede49 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Nov 2019 10:04:11 -0800 Subject: net: silence data-races on sk_backlog.tail sk->sk_backlog.tail might be read without holding the socket spinlock, we need to add proper READ_ONCE()/WRITE_ONCE() to silence the warnings. 
KCSAN reported : BUG: KCSAN: data-race in tcp_add_backlog / tcp_recvmsg write to 0xffff8881265109f8 of 8 bytes by interrupt on cpu 1: __sk_add_backlog include/net/sock.h:907 [inline] sk_add_backlog include/net/sock.h:938 [inline] tcp_add_backlog+0x476/0xce0 net/ipv4/tcp_ipv4.c:1759 tcp_v4_rcv+0x1a70/0x1bd0 net/ipv4/tcp_ipv4.c:1947 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:442 [inline] ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:4929 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5043 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5133 napi_skb_finish net/core/dev.c:5596 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5629 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 virtnet_receive drivers/net/virtio_net.c:1323 [inline] virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428 napi_poll net/core/dev.c:6311 [inline] net_rx_action+0x3ae/0xa90 net/core/dev.c:6379 __do_softirq+0x115/0x33f kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0xbb/0xe0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] do_IRQ+0xa6/0x180 arch/x86/kernel/irq.c:263 ret_from_intr+0x0/0x19 native_safe_halt+0xe/0x10 arch/x86/kernel/paravirt.c:71 arch_cpu_idle+0x1f/0x30 arch/x86/kernel/process.c:571 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x1af/0x280 kernel/sched/idle.c:263 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355 start_secondary+0x208/0x260 arch/x86/kernel/smpboot.c:264 secondary_startup_64+0xa4/0xb0 
arch/x86/kernel/head_64.S:241 read to 0xffff8881265109f8 of 8 bytes by task 8057 on cpu 0: tcp_recvmsg+0x46e/0x1b40 net/ipv4/tcp.c:2050 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 sock_read_iter+0x15f/0x1e0 net/socket.c:967 call_read_iter include/linux/fs.h:1889 [inline] new_sync_read+0x389/0x4f0 fs/read_write.c:414 __vfs_read+0xb1/0xc0 fs/read_write.c:427 vfs_read fs/read_write.c:461 [inline] vfs_read+0x143/0x2c0 fs/read_write.c:446 ksys_read+0xd5/0x1b0 fs/read_write.c:587 __do_sys_read fs/read_write.c:597 [inline] __se_sys_read fs/read_write.c:595 [inline] __x64_sys_read+0x4c/0x60 fs/read_write.c:595 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 8057 Comm: syz-fuzzer Not tainted 5.4.0-rc6+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index d4d3ef5ba049..bd210c78dc9d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -899,11 +899,11 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) skb_dst_force(skb); if (!sk->sk_backlog.tail) - sk->sk_backlog.head = skb; + WRITE_ONCE(sk->sk_backlog.head, skb); else sk->sk_backlog.tail->next = skb; - sk->sk_backlog.tail = skb; + WRITE_ONCE(sk->sk_backlog.tail, skb); skb->next = NULL; } -- cgit v1.2.3 From 2c63221cd9e5c0dad0424029aeb1c40faada8330 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Wed, 6 Nov 2019 23:36:13 +0100 Subject: dt-bindings: net: phy: Add support for AT803X Document the Atheros AR803x PHY bindings. 
Signed-off-by: Michael Walle Reviewed-by: Florian Fainelli Reviewed-by: Rob Herring Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/dt-bindings/net/qca-ar803x.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 include/dt-bindings/net/qca-ar803x.h (limited to 'include') diff --git a/include/dt-bindings/net/qca-ar803x.h b/include/dt-bindings/net/qca-ar803x.h new file mode 100644 index 000000000000..9c046c7242ed --- /dev/null +++ b/include/dt-bindings/net/qca-ar803x.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Device Tree constants for the Qualcomm Atheros AR803x PHYs + */ + +#ifndef _DT_BINDINGS_QCA_AR803X_H +#define _DT_BINDINGS_QCA_AR803X_H + +#define AR803X_STRENGTH_FULL 0 +#define AR803X_STRENGTH_HALF 1 +#define AR803X_STRENGTH_QUARTER 2 + +#endif -- cgit v1.2.3 From 200ecef67b8d09d16ec55f91c92751dcc7a38d40 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 11:51:18 -0800 Subject: tcp: Remove one extra ktime_get_ns() from cookie_init_timestamp tcp_make_synack() already uses tcp_clock_ns(), and can pass the value to cookie_init_timestamp() to avoid another call to ktime_get_ns() helper. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index ab4eb5eb5d07..36f195fb576a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -537,7 +537,7 @@ static inline u32 tcp_cookie_time(void) u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); -u64 cookie_init_timestamp(struct request_sock *req); +u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); bool cookie_ecn_ok(const struct tcp_options_received *opt, @@ -757,10 +757,16 @@ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); } +/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ +static inline u32 tcp_ns_to_ts(u64 ns) +{ + return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); +} + /* Could use tcp_clock_us() / 1000, but this version uses a single divide */ static inline u32 tcp_time_stamp_raw(void) { - return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ); + return tcp_ns_to_ts(tcp_clock_ns()); } void tcp_mstamp_refresh(struct tcp_sock *tp); @@ -772,7 +778,7 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { - return div_u64(skb->skb_mstamp_ns, NSEC_PER_SEC / TCP_TS_HZ); + return tcp_ns_to_ts(skb->skb_mstamp_ns); } /* provide the departure time in us unit */ -- cgit v1.2.3 From 6896cc4d8fe6fe6163d6f0baa02a270da68896e8 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 7 Nov 2019 18:42:09 +0200 Subject: devlink: Add layer 3 generic packet traps Add packet traps that can report packets that were dropped during layer 3 forwarding. Signed-off-by: Amit Cohen Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/net/devlink.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 6bf3b9e0595a..df7814d55bf9 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -569,6 +569,15 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_BLACKHOLE_ROUTE, DEVLINK_TRAP_GENERIC_ID_TTL_ERROR, DEVLINK_TRAP_GENERIC_ID_TAIL_DROP, + DEVLINK_TRAP_GENERIC_ID_NON_IP_PACKET, + DEVLINK_TRAP_GENERIC_ID_UC_DIP_MC_DMAC, + DEVLINK_TRAP_GENERIC_ID_DIP_LB, + DEVLINK_TRAP_GENERIC_ID_SIP_MC, + DEVLINK_TRAP_GENERIC_ID_SIP_LB, + DEVLINK_TRAP_GENERIC_ID_CORRUPTED_IP_HDR, + DEVLINK_TRAP_GENERIC_ID_IPV4_SIP_BC, + DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_RESERVED_SCOPE, + DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -607,6 +616,24 @@ enum devlink_trap_group_generic_id { "ttl_value_is_too_small" #define DEVLINK_TRAP_GENERIC_NAME_TAIL_DROP \ "tail_drop" +#define DEVLINK_TRAP_GENERIC_NAME_NON_IP_PACKET \ + "non_ip" +#define DEVLINK_TRAP_GENERIC_NAME_UC_DIP_MC_DMAC \ + "uc_dip_over_mc_dmac" +#define DEVLINK_TRAP_GENERIC_NAME_DIP_LB \ + "dip_is_loopback_address" +#define DEVLINK_TRAP_GENERIC_NAME_SIP_MC \ + "sip_is_mc" +#define DEVLINK_TRAP_GENERIC_NAME_SIP_LB \ + "sip_is_loopback_address" +#define DEVLINK_TRAP_GENERIC_NAME_CORRUPTED_IP_HDR \ + "ip_header_corrupted" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_SIP_BC \ + "ipv4_sip_is_limited_bc" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_RESERVED_SCOPE \ + "ipv6_mc_dip_reserved_scope" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE \ + "ipv6_mc_dip_interface_local_scope" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" -- cgit v1.2.3 From 3b063ae57bdfec5e574ace440e6c3f34c4115a92 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 7 Nov 2019 18:42:14 +0200 Subject: devlink: Add layer 3 generic packet exception traps Add 
layer 3 generic packet exception traps that can report trapped packets and documentation of the traps. Unlike drop traps, these exception traps also need to inject the packet to the kernel's receive path. For example, a packet that was trapped due to an unreachable neighbour needs to be injected into the kernel so that it will trigger an ARP request or a neighbour solicitation message. Signed-off-by: Amit Cohen Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index df7814d55bf9..8d6b5846822c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -578,6 +578,12 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_IPV4_SIP_BC, DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_RESERVED_SCOPE, DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, + DEVLINK_TRAP_GENERIC_ID_MTU_ERROR, + DEVLINK_TRAP_GENERIC_ID_UNRESOLVED_NEIGH, + DEVLINK_TRAP_GENERIC_ID_RPF, + DEVLINK_TRAP_GENERIC_ID_REJECT_ROUTE, + DEVLINK_TRAP_GENERIC_ID_IPV4_LPM_UNICAST_MISS, + DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -634,6 +640,18 @@ enum devlink_trap_group_generic_id { "ipv6_mc_dip_reserved_scope" #define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE \ "ipv6_mc_dip_interface_local_scope" +#define DEVLINK_TRAP_GENERIC_NAME_MTU_ERROR \ + "mtu_value_is_too_small" +#define DEVLINK_TRAP_GENERIC_NAME_UNRESOLVED_NEIGH \ + "unresolved_neigh" +#define DEVLINK_TRAP_GENERIC_NAME_RPF \ + "mc_reverse_path_forwarding" +#define DEVLINK_TRAP_GENERIC_NAME_REJECT_ROUTE \ + "reject_route" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_LPM_UNICAST_MISS \ + "ipv4_lpm_miss" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_LPM_UNICAST_MISS \ + "ipv6_lpm_miss" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" -- cgit v1.2.3 From 
de7d5084d82794a8e83afb994fcb07f82da3cd7b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 16:27:14 -0800 Subject: net: provide dev_lstats_read() helper Many network drivers use hand-coded implementation of the same thing, let's factorize things so that u64_stats_t adoption is done once. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1f140a6b66df..75561992c31f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2401,6 +2401,8 @@ struct pcpu_lstats { struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); +void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes); + #define __netdev_alloc_pcpu_stats(type, gfp) \ ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ -- cgit v1.2.3 From dd5382a08157756510aa8d7269c662eccde775cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 16:27:15 -0800 Subject: net: provide dev_lstats_add() helper Many network drivers need it and hand-coded the same function. In order to ease u64_stats_t adoption, it is time to factorize. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 75561992c31f..461a36220cf4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2403,6 +2403,16 @@ struct pcpu_lstats { void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes); +static inline void dev_lstats_add(struct net_device *dev, unsigned int len) +{ + struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats); + + u64_stats_update_begin(&lstats->syncp); + lstats->bytes += len; + lstats->packets++; + u64_stats_update_end(&lstats->syncp); +} + #define __netdev_alloc_pcpu_stats(type, gfp) \ ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ -- cgit v1.2.3 From 316580b69d0a7aeeee5063af47438b626bc47cbd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 16:27:20 -0800 Subject: u64_stats: provide u64_stats_t type On 64bit arches, struct u64_stats_sync is empty and provides no help against load/store tearing. Using READ_ONCE()/WRITE_ONCE() would be needed. But the update side would be slightly more expensive. local64_t was defined so that we could use regular adds in a manner which is atomic wrt IRQs. However the u64_stats infra means we do not have to use local64_t on 32bit arches since the syncp provides the needed protection. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/u64_stats_sync.h | 51 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index a27604f99ed0..9de5c10293f5 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -40,8 +40,8 @@ * spin_lock_bh(...) or other synchronization to get exclusive access * ... 
* u64_stats_update_begin(&stats->syncp); - * stats->bytes64 += len; // non atomic operation - * stats->packets64++; // non atomic operation + * u64_stats_add(&stats->bytes64, len); // non atomic operation + * u64_stats_inc(&stats->packets64); // non atomic operation * u64_stats_update_end(&stats->syncp); * * While a consumer (reader) should use following template to get consistent @@ -52,8 +52,8 @@ * * do { * start = u64_stats_fetch_begin(&stats->syncp); - * tbytes = stats->bytes64; // non atomic operation - * tpackets = stats->packets64; // non atomic operation + * tbytes = u64_stats_read(&stats->bytes64); // non atomic operation + * tpackets = u64_stats_read(&stats->packets64); // non atomic operation * } while (u64_stats_fetch_retry(&stats->syncp, start)); * * @@ -68,6 +68,49 @@ struct u64_stats_sync { #endif }; +#if BITS_PER_LONG == 64 +#include <asm/local64.h> + +typedef struct { + local64_t v; +} u64_stats_t ; + +static inline u64 u64_stats_read(const u64_stats_t *p) +{ + return local64_read(&p->v); +} + +static inline void u64_stats_add(u64_stats_t *p, unsigned long val) +{ + local64_add(val, &p->v); +} + +static inline void u64_stats_inc(u64_stats_t *p) +{ + local64_inc(&p->v); +} + +#else + +typedef struct { + u64 v; +} u64_stats_t; + +static inline u64 u64_stats_read(const u64_stats_t *p) +{ + return p->v; +} + +static inline void u64_stats_add(u64_stats_t *p, unsigned long val) +{ + p->v += val; +} + +static inline void u64_stats_inc(u64_stats_t *p) +{ + p->v++; +} +#endif static inline void u64_stats_init(struct u64_stats_sync *syncp) { -- cgit v1.2.3 From fd2f4737870eb866537fbbffa2b59414b9b0c0a2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 16:27:22 -0800 Subject: net: use u64_stats_t in struct pcpu_lstats In order to fix the data-race found by KCSAN, we can use the new u64_stats_t type and its accessors instead of plain u64 fields. This will still generate optimal code for both 32 and 64 bit platforms. 
Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 461a36220cf4..f857f01234f7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2396,8 +2396,8 @@ struct pcpu_sw_netstats { } __aligned(4 * sizeof(u64)); struct pcpu_lstats { - u64 packets; - u64 bytes; + u64_stats_t packets; + u64_stats_t bytes; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); @@ -2408,8 +2408,8 @@ static inline void dev_lstats_add(struct net_device *dev, unsigned int len) struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats); u64_stats_update_begin(&lstats->syncp); - lstats->bytes += len; - lstats->packets++; + u64_stats_add(&lstats->bytes, len); + u64_stats_inc(&lstats->packets); u64_stats_update_end(&lstats->syncp); } -- cgit v1.2.3 From c305c6ae79e2ce20c22660ceda94f0d86d639a82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 18:29:11 -0800 Subject: net: add annotations on hh->hh_len lockless accesses KCSAN reported a data-race [1] While we can use READ_ONCE() on the read sides, we need to make sure hh->hh_len is written last. 
[1] BUG: KCSAN: data-race in eth_header_cache / neigh_resolve_output write to 0xffff8880b9dedcb8 of 4 bytes by task 29760 on cpu 0: eth_header_cache+0xa9/0xd0 net/ethernet/eth.c:247 neigh_hh_init net/core/neighbour.c:1463 [inline] neigh_resolve_output net/core/neighbour.c:1480 [inline] neigh_resolve_output+0x415/0x470 net/core/neighbour.c:1470 neigh_output include/net/neighbour.h:511 [inline] ip6_finish_output2+0x7a2/0xec0 net/ipv6/ip6_output.c:116 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ndisc_send_skb+0x459/0x5f0 net/ipv6/ndisc.c:505 ndisc_send_ns+0x207/0x430 net/ipv6/ndisc.c:647 rt6_probe_deferred+0x98/0xf0 net/ipv6/route.c:615 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 read to 0xffff8880b9dedcb8 of 4 bytes by task 29572 on cpu 1: neigh_resolve_output net/core/neighbour.c:1479 [inline] neigh_resolve_output+0x113/0x470 net/core/neighbour.c:1470 neigh_output include/net/neighbour.h:511 [inline] ip6_finish_output2+0x7a2/0xec0 net/ipv6/ip6_output.c:116 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ndisc_send_skb+0x459/0x5f0 net/ipv6/ndisc.c:505 ndisc_send_ns+0x207/0x430 net/ipv6/ndisc.c:647 rt6_probe_deferred+0x98/0xf0 net/ipv6/route.c:615 process_one_work+0x3d4/0x890 
kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 29572 Comm: kworker/1:4 Not tainted 5.4.0-rc6+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events rt6_probe_deferred Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 50a67bd6a434..6a86e49181db 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -468,7 +468,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb do { seq = read_seqbegin(&hh->hh_lock); - hh_len = hh->hh_len; + hh_len = READ_ONCE(hh->hh_len); if (likely(hh_len <= HH_DATA_MOD)) { hh_alen = HH_DATA_MOD; -- cgit v1.2.3 From f8cc62ca3e660ae3fdaee533b1d554297cd2ae82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 18:49:43 -0800 Subject: net: add a READ_ONCE() in skb_peek_tail() skb_peek_tail() can be used without protection of a lock, as spotted by KCSAN [1] In order to avoid load-tearing, add a READ_ONCE() Note that the corresponding WRITE_ONCE() are already there. 
[1] BUG: KCSAN: data-race in sk_wait_data / skb_queue_tail read to 0xffff8880b36a4118 of 8 bytes by task 20426 on cpu 1: skb_peek_tail include/linux/skbuff.h:1784 [inline] sk_wait_data+0x15b/0x250 net/core/sock.c:2477 kcm_wait_data+0x112/0x1f0 net/kcm/kcmsock.c:1103 kcm_recvmsg+0xac/0x320 net/kcm/kcmsock.c:1130 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 ___sys_recvmsg+0x1a0/0x3e0 net/socket.c:2480 do_recvmmsg+0x19a/0x5c0 net/socket.c:2601 __sys_recvmmsg+0x1ef/0x200 net/socket.c:2680 __do_sys_recvmmsg net/socket.c:2703 [inline] __se_sys_recvmmsg net/socket.c:2696 [inline] __x64_sys_recvmmsg+0x89/0xb0 net/socket.c:2696 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 write to 0xffff8880b36a4118 of 8 bytes by task 451 on cpu 0: __skb_insert include/linux/skbuff.h:1852 [inline] __skb_queue_before include/linux/skbuff.h:1958 [inline] __skb_queue_tail include/linux/skbuff.h:1991 [inline] skb_queue_tail+0x7e/0xc0 net/core/skbuff.c:3145 kcm_queue_rcv_skb+0x202/0x310 net/kcm/kcmsock.c:206 kcm_rcv_strparser+0x74/0x4b0 net/kcm/kcmsock.c:370 __strp_recv+0x348/0xf50 net/strparser/strparser.c:309 strp_recv+0x84/0xa0 net/strparser/strparser.c:343 tcp_read_sock+0x174/0x5c0 net/ipv4/tcp.c:1639 strp_read_sock+0xd4/0x140 net/strparser/strparser.c:366 do_strp_work net/strparser/strparser.c:414 [inline] strp_work+0x9a/0xe0 net/strparser/strparser.c:423 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 451 Comm: kworker/u4:3 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: kstrp strp_work Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 53238ac725a3..dfe02b658829 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1795,7 +1795,7 @@ static inline struct sk_buff *skb_peek_next(struct sk_buff *skb, */ static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_) { - struct sk_buff *skb = list_->prev; + struct sk_buff *skb = READ_ONCE(list_->prev); if (skb == (struct sk_buff *)list_) skb = NULL; @@ -1861,7 +1861,9 @@ static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) { - /* see skb_queue_empty_lockless() for the opposite READ_ONCE() */ + /* See skb_queue_empty_lockless() and skb_peek_tail() + * for the opposite READ_ONCE() + */ WRITE_ONCE(newsk->next, next); WRITE_ONCE(newsk->prev, prev); WRITE_ONCE(next->prev, newsk); -- cgit v1.2.3 From 6912daed05e1370af5253aea6f2116805c0e57f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 23 Oct 2019 11:59:00 +0200 Subject: mac80211: Shrink the size of ack_frame_id to make room for tx_time_est MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To implement airtime queue limiting, we need to keep a running account of the estimated airtime of all skbs queued into the device. To do this correctly, we need to store the airtime estimate into the skb so we can decrease the outstanding balance when the skb is freed. This means that the time estimate must be stored somewhere that will survive for the lifetime of the skb. To get this, decrease the size of the ack_frame_id field to 6 bits, and lower the size of the ID space accordingly. This leaves 10 bits for use for tx_time_est, which is enough to store a maximum of 4096 us, if we shift the values so they become units of 4us. 
Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/157182474063.150713.16132669599100802716.stgit@toke.dk Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f5996960eace..c643a19dce96 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -967,6 +967,7 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * @band: the band to transmit on (use for checking for races) * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC * @ack_frame_id: internal frame ID for TX status, used internally + * @tx_time_est: TX time estimate in units of 4us, used internally * @control: union part for control data * @control.rates: TX rates array to try * @control.rts_cts_rate_idx: rate for RTS or CTS @@ -1007,7 +1008,8 @@ struct ieee80211_tx_info { u8 hw_queue; - u16 ack_frame_id; + u16 ack_frame_id:6; + u16 tx_time_est:10; union { struct { -- cgit v1.2.3 From 14f34e36b36ceede9877ca422a62fcac17b52023 Mon Sep 17 00:00:00 2001 From: Gurumoorthi Gnanasambandhan Date: Thu, 31 Oct 2019 23:46:40 +0200 Subject: cfg80211: VLAN offload support for set_key and set_sta_vlan This provides an alternative mechanism for AP VLAN support where a single netdev is used with VLAN tagged frames instead of separate netdevs for each VLAN without tagged frames from the WLAN driver. By setting NL80211_EXT_FEATURE_VLAN_OFFLOAD flag the driver indicates support for a single netdev with VLAN tagged frames. Separate VLAN-specific netdevs can be added using RTM_NEWLINK/IFLA_VLAN_ID similarly to Ethernet. NL80211_CMD_NEW_KEY (for group keys), NL80211_CMD_NEW_STATION, and NL80211_CMD_SET_STATION will optionally specify vlan_id using NL80211_ATTR_VLAN_ID. 
Signed-off-by: Gurumoorthi Gnanasambandhan Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20191031214640.5012-1-jouni@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4ab2c49423dc..e309cc826b40 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -565,6 +565,7 @@ struct vif_params { * with the get_key() callback, must be in little endian, * length given by @seq_len. * @seq_len: length of @seq. + * @vlan_id: vlan_id for VLAN group key (if nonzero) * @mode: key install mode (RX_TX, NO_TX or SET_TX) */ struct key_params { @@ -572,6 +573,7 @@ struct key_params { const u8 *seq; int key_len; int seq_len; + u16 vlan_id; u32 cipher; enum nl80211_key_mode mode; }; @@ -1124,6 +1126,7 @@ struct sta_txpwr { * (bitmask of BIT(%NL80211_STA_FLAG_...)) * @listen_interval: listen interval or -1 for no change * @aid: AID or zero for no change + * @vlan_id: VLAN ID for station (if nonzero) * @peer_aid: mesh peer AID or zero for no change * @plink_action: plink action to take * @plink_state: set the peer link state for a station @@ -1159,6 +1162,7 @@ struct station_parameters { u32 sta_modify_mask; int listen_interval; u16 aid; + u16 vlan_id; u16 peer_aid; u8 supported_rates_len; u8 plink_action; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 64135ab3a7ac..341e0e8cae46 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -248,6 +248,22 @@ * %NL80211_ATTR_SAE_PASSWORD. 
*/ +/** + * DOC: VLAN offload support for setting group keys and binding STAs to VLANs + * + * By setting @NL80211_EXT_FEATURE_VLAN_OFFLOAD flag drivers can indicate they + * support offloading VLAN functionality in a manner where the driver exposes a + * single netdev that uses VLAN tagged frames and separate VLAN-specific netdevs + * can then be added using RTM_NEWLINK/IFLA_VLAN_ID similarly to the Ethernet + * case. Frames received from stations that are not assigned to any VLAN are + * delivered on the main netdev and frames to such stations can be sent through + * that main netdev. + * + * %NL80211_CMD_NEW_KEY (for group keys), %NL80211_CMD_NEW_STATION, and + * %NL80211_CMD_SET_STATION will optionally specify vlan_id using + * %NL80211_ATTR_VLAN_ID. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -2381,6 +2397,9 @@ enum nl80211_commands { * the allowed channel bandwidth configurations. (u8 attribute) * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13. * + * @NL80211_ATTR_VLAN_ID: VLAN ID (1..4094) for the station and VLAN group key + * (u16). + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2843,6 +2862,8 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_EDMG_CHANNELS, NL80211_ATTR_WIPHY_EDMG_BW_CONFIG, + NL80211_ATTR_VLAN_ID, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5492,6 +5513,10 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SAE_OFFLOAD: Device wants to do SAE authentication in * station mode (SAE password is passed as part of the connect command). * + * @NL80211_EXT_FEATURE_VLAN_OFFLOAD: The driver supports a single netdev + * with VLAN tagged frames and separate VLAN-specific netdevs added using + * vconfig similarly to the Ethernet case. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. 
* @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5537,6 +5562,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_EXT_KEY_ID, NL80211_EXT_FEATURE_STA_TX_PWR, NL80211_EXT_FEATURE_SAE_OFFLOAD, + NL80211_EXT_FEATURE_VLAN_OFFLOAD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 90b2be27bb0e56483f335cc10fb59ec66882b949 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Nov 2019 08:45:23 -0800 Subject: net/sched: annotate lockless accesses to qdisc->empty KCSAN reported the following race [1] BUG: KCSAN: data-race in __dev_queue_xmit / net_tx_action read to 0xffff8880ba403508 of 1 bytes by task 21814 on cpu 1: __dev_xmit_skb net/core/dev.c:3389 [inline] __dev_queue_xmit+0x9db/0x1b40 net/core/dev.c:3761 dev_queue_xmit+0x21/0x30 net/core/dev.c:3825 neigh_hh_output include/net/neighbour.h:500 [inline] neigh_output include/net/neighbour.h:509 [inline] ip6_finish_output2+0x873/0xec0 net/ipv6/ip6_output.c:116 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] ip6_local_out+0x74/0x90 net/ipv6/output_core.c:179 ip6_send_skb+0x53/0x110 net/ipv6/ip6_output.c:1795 udp_v6_send_skb.isra.0+0x3ec/0xa70 net/ipv6/udp.c:1173 udpv6_sendmsg+0x1906/0x1c20 net/ipv6/udp.c:1471 inet6_sendmsg+0x6d/0x90 net/ipv6/af_inet6.c:576 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 ___sys_sendmsg+0x2b7/0x5d0 net/socket.c:2311 __sys_sendmmsg+0x123/0x350 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x64/0x80 net/socket.c:2439 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 write to 0xffff8880ba403508 of 1 bytes 
by interrupt on cpu 0: qdisc_run_begin include/net/sch_generic.h:160 [inline] qdisc_run include/net/pkt_sched.h:120 [inline] net_tx_action+0x2b1/0x6c0 net/core/dev.c:4551 __do_softirq+0x115/0x33f kernel/softirq.c:292 do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1082 do_softirq.part.0+0x6b/0x80 kernel/softirq.c:337 do_softirq kernel/softirq.c:329 [inline] __local_bh_enable_ip+0x76/0x80 kernel/softirq.c:189 local_bh_enable include/linux/bottom_half.h:32 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:688 [inline] ip6_finish_output2+0x7bb/0xec0 net/ipv6/ip6_output.c:117 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] ip6_local_out+0x74/0x90 net/ipv6/output_core.c:179 ip6_send_skb+0x53/0x110 net/ipv6/ip6_output.c:1795 udp_v6_send_skb.isra.0+0x3ec/0xa70 net/ipv6/udp.c:1173 udpv6_sendmsg+0x1906/0x1c20 net/ipv6/udp.c:1471 inet6_sendmsg+0x6d/0x90 net/ipv6/af_inet6.c:576 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 ___sys_sendmsg+0x2b7/0x5d0 net/socket.c:2311 __sys_sendmmsg+0x123/0x350 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x64/0x80 net/socket.c:2439 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 21817 Comm: syz-executor.2 Not tainted 5.4.0-rc6+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: d518d2ed8640 ("net/sched: fix race between deactivation and dequeue for NOLOCK qdisc") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Paolo Abeni Cc: Davide Caratti Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a8b0a9a4c686..d43da37737be 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -148,8 +148,8 @@ static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) static inline bool qdisc_is_empty(const struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) - return qdisc->empty; - return !qdisc->q.qlen; + return READ_ONCE(qdisc->empty); + return !READ_ONCE(qdisc->q.qlen); } static inline bool qdisc_run_begin(struct Qdisc *qdisc) @@ -157,7 +157,7 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) if (qdisc->flags & TCQ_F_NOLOCK) { if (!spin_trylock(&qdisc->seqlock)) return false; - qdisc->empty = false; + WRITE_ONCE(qdisc->empty, false); } else if (qdisc_is_running(qdisc)) { return false; } -- cgit v1.2.3 From 134bdac397661a5841d9f27f508190c68b26232b Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 8 Nov 2019 12:05:10 +0700 Subject: tipc: add new AEAD key structure for user API The new structure 'tipc_aead_key' is added to the 'tipc.h' for user to be able to transfer a key to TIPC in kernel. Netlink will be used for this purpose in the later commits. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. 
Miller --- include/uapi/linux/tipc.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 76421b878767..add01db1daef 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -233,6 +233,27 @@ struct tipc_sioc_nodeid_req { char node_id[TIPC_NODEID_LEN]; }; +/* + * TIPC Crypto, AEAD + */ +#define TIPC_AEAD_ALG_NAME (32) + +struct tipc_aead_key { + char alg_name[TIPC_AEAD_ALG_NAME]; + unsigned int keylen; /* in bytes */ + char key[]; +}; + +#define TIPC_AEAD_KEYLEN_MIN (16 + 4) +#define TIPC_AEAD_KEYLEN_MAX (32 + 4) +#define TIPC_AEAD_KEY_SIZE_MAX (sizeof(struct tipc_aead_key) + \ + TIPC_AEAD_KEYLEN_MAX) + +static inline int tipc_aead_key_size(struct tipc_aead_key *key) +{ + return sizeof(*key) + key->keylen; +} + /* The macros and functions below are deprecated: */ -- cgit v1.2.3 From e1f32190cf7ddd55778b460e7d44af3f76529698 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 8 Nov 2019 12:05:12 +0700 Subject: tipc: add support for AEAD key setting via netlink This commit adds two netlink commands to TIPC in order for user to be able to set or remove AEAD keys: - TIPC_NL_KEY_SET - TIPC_NL_KEY_FLUSH When the 'KEY_SET' is given along with the key data, the key will be initiated and attached to TIPC crypto. On the other hand, the 'KEY_FLUSH' command will remove all existing keys if any. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. 
Miller --- include/uapi/linux/tipc_netlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index efb958fd167d..6c2194ab745b 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -63,6 +63,8 @@ enum { TIPC_NL_PEER_REMOVE, TIPC_NL_BEARER_ADD, TIPC_NL_UDP_GET_REMOTEIP, + TIPC_NL_KEY_SET, + TIPC_NL_KEY_FLUSH, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -160,6 +162,8 @@ enum { TIPC_NLA_NODE_UNSPEC, TIPC_NLA_NODE_ADDR, /* u32 */ TIPC_NLA_NODE_UP, /* flag */ + TIPC_NLA_NODE_ID, /* data */ + TIPC_NLA_NODE_KEY, /* data */ __TIPC_NLA_NODE_MAX, TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 -- cgit v1.2.3 From a0c76345e3d3dbc40c39de2e00d15a3b7eef7885 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 8 Nov 2019 21:42:43 +0100 Subject: devlink: disallow reload operation during device cleanup There is a race between driver code that does setup/cleanup of device and devlink reload operation that in some drivers works with the same code. Use after free could be easily obtained by running: while true; do echo 10 > /sys/bus/netdevsim/new_device devlink dev reload netdevsim/netdevsim10 & echo 10 > /sys/bus/netdevsim/del_device done Fix this by enabling reload only after setup of device is complete and disabling it at the beginning of the cleanup process. Reported-by: Ido Schimmel Fixes: 2d8dc5bbf4e7 ("devlink: Add support for reload") Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/net/devlink.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 8d6b5846822c..7891611868e4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -38,8 +38,9 @@ struct devlink { struct device *dev; possible_net_t _net; struct mutex lock; - bool reload_failed; - bool registered; + u8 reload_failed:1, + reload_enabled:1, + registered:1; char priv[0] __aligned(NETDEV_ALIGN); }; @@ -824,6 +825,8 @@ void devlink_net_set(struct devlink *devlink, struct net *net); struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); int devlink_register(struct devlink *devlink, struct device *dev); void devlink_unregister(struct devlink *devlink); +void devlink_reload_enable(struct devlink *devlink); +void devlink_reload_disable(struct devlink *devlink); void devlink_free(struct devlink *devlink); int devlink_port_register(struct devlink *devlink, struct devlink_port *devlink_port, -- cgit v1.2.3 From aef587be42925f92418083f08852d0011b2766ca Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:32 +0800 Subject: sctp: add pf_expose per netns and sock and asoc As said in rfc7829, section 3, point 12: The SCTP stack SHOULD expose the PF state of its destination addresses to the ULP as well as provide the means to notify the ULP of state transitions of its destination addresses from active to PF, and vice versa. However, it is recommended that an SCTP stack implementing SCTP-PF also allows for the ULP to be kept ignorant of the PF state of its destinations and the associated state transitions, thus allowing for retention of the simpler state transition model of [RFC4960] in the ULP. Not only does it allow to expose the PF state to ULP, but also allow to ignore sctp-pf to ULP. So this patch is to add pf_expose per netns, sock and asoc. 
And in sctp_assoc_control_transport(), ulp_notify will be set to false if asoc->expose is not 'enabled' in next patch. It also allows a user to change pf_expose per netns by sysctl, and pf_expose per sock and asoc will be initialized with it. Note that pf_expose also works for SCTP_GET_PEER_ADDR_INFO sockopt, to not allow a user to query the state of a sctp-pf peer address when pf_expose is 'disabled', as said in section 7.3. v1->v2: - Fix a build warning noticed by Nathan Chancellor. v2->v3: - set pf_expose to UNUSED by default to keep compatible with old applications. v3->v4: - add a new entry for pf_expose on ip-sysctl.txt, as Marcelo suggested. - change this patch to 1/5, and move sctp_assoc_control_transport change into 2/5, as Marcelo suggested. - use SCTP_PF_EXPOSE_UNSET instead of SCTP_PF_EXPOSE_UNUSED, and set SCTP_PF_EXPOSE_UNSET to 0 in enum, as Marcelo suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/netns/sctp.h | 8 ++++++++ include/net/sctp/constants.h | 10 ++++++++++ include/net/sctp/structs.h | 2 ++ include/uapi/linux/sctp.h | 1 + 4 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index bdc0f27b8514..18c3ddae77a3 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -96,6 +96,14 @@ struct netns_sctp { */ int pf_enable; + /* + * Disable Potentially-Failed state exposure, ignored by default + * pf_expose - 0 : compatible with old applications (by default) + * - 1 : disable pf state exposure + * - 2 : enable pf state exposure + */ + int pf_expose; + /* * Policy for preforming sctp/socket accounting * 0 - do socket level accounting, all assocs share sk_sndbuf diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 823afc42a3aa..e88b77a34cb1 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -286,6 +286,16 @@ enum { SCTP_MAX_GABS = 16 }; * functions simpler to 
write. */ +/* These are the values for pf exposure, UNUSED is to keep compatible with old + * applications by default. + */ +enum { + SCTP_PF_EXPOSE_UNSET, + SCTP_PF_EXPOSE_DISABLE, + SCTP_PF_EXPOSE_ENABLE, +}; +#define SCTP_PF_EXPOSE_MAX SCTP_PF_EXPOSE_ENABLE + /* These return values describe the success or failure of a number of * routines which form the lower interface to SCTP_outqueue. */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 503fbc3cd819..9a43738774d7 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -215,6 +215,7 @@ struct sctp_sock { __u32 adaptation_ind; __u32 pd_point; __u16 nodelay:1, + pf_expose:2, reuse:1, disable_fragments:1, v4mapped:1, @@ -2053,6 +2054,7 @@ struct sctp_association { __u8 need_ecne:1, /* Need to send an ECNE Chunk? */ temp:1, /* Is it a temporary association? */ + pf_expose:2, /* Expose pf state? */ force_delay:1; __u8 strreset_enable; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6bce7f9837a9..765f41a080b4 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -933,6 +933,7 @@ struct sctp_paddrinfo { enum sctp_spinfo_state { SCTP_INACTIVE, SCTP_PF, +#define SCTP_POTENTIALLY_FAILED SCTP_PF SCTP_ACTIVE, SCTP_UNCONFIRMED, SCTP_UNKNOWN = 0xffff /* Value used for transport state unknown */ -- cgit v1.2.3 From 768e15182dcb809e39c338290dda10c4e271d133 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:33 +0800 Subject: sctp: add SCTP_ADDR_POTENTIALLY_FAILED notification SCTP Quick failover draft section 5.1, point 5 has been removed from rfc7829. Instead, "the sender SHOULD (i) notify the Upper Layer Protocol (ULP) about this state transition", as said in section 3.2, point 8. So this patch is to add SCTP_ADDR_POTENTIALLY_FAILED, defined in section 7.1, "which is reported if the affected address becomes PF". Also remove transport cwnd's update when moving from PF back to ACTIVE , which is no longer in rfc7829 either. 
Note that ulp_notify will be set to false if asoc->expose is not 'enabled', according to last patch. v2->v3: - define SCTP_ADDR_PF SCTP_ADDR_POTENTIALLY_FAILED. v3->v4: - initialize spc_state with SCTP_ADDR_AVAILABLE, as Marcelo suggested. - check asoc->pf_expose in sctp_assoc_control_transport(), as Marcelo suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 765f41a080b4..d99b428ac34e 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -410,6 +410,8 @@ enum sctp_spc_state { SCTP_ADDR_ADDED, SCTP_ADDR_MADE_PRIM, SCTP_ADDR_CONFIRMED, + SCTP_ADDR_POTENTIALLY_FAILED, +#define SCTP_ADDR_PF SCTP_ADDR_POTENTIALLY_FAILED }; -- cgit v1.2.3 From 8d2a6935d842f12c25611b165eace778adb09a53 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:34 +0800 Subject: sctp: add SCTP_EXPOSE_POTENTIALLY_FAILED_STATE sockopt This is a sockopt defined in section 7.3 of rfc7829: "Exposing the Potentially Failed Path State", by which users can change pf_expose per sock and asoc. The new sockopt SCTP_EXPOSE_POTENTIALLY_FAILED_STATE is also known as SCTP_EXPOSE_PF_STATE for short. v2->v3: - return -EINVAL if params.assoc_value > SCTP_PF_EXPOSE_MAX. - define SCTP_EXPOSE_PF_STATE SCTP_EXPOSE_POTENTIALLY_FAILED_STATE. v3->v4: - improve changelog. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. 
Miller --- include/uapi/linux/sctp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index d99b428ac34e..a190e4a7f546 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -137,6 +137,8 @@ typedef __s32 sctp_assoc_t; #define SCTP_ASCONF_SUPPORTED 128 #define SCTP_AUTH_SUPPORTED 129 #define SCTP_ECN_SUPPORTED 130 +#define SCTP_EXPOSE_POTENTIALLY_FAILED_STATE 131 +#define SCTP_EXPOSE_PF_STATE SCTP_EXPOSE_POTENTIALLY_FAILED_STATE /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 -- cgit v1.2.3 From 34515e94c92c3f593cd696abca8609246cbd75e6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:35 +0800 Subject: sctp: add support for Primary Path Switchover This is a new feature defined in section 5 of rfc7829: "Primary Path Switchover". By introducing a new tunable parameter: Primary.Switchover.Max.Retrans (PSMR) The primary path will be changed to another active path when the path error counter on the old primary path exceeds PSMR, so that "the SCTP sender is allowed to continue data transmission on a new working path even when the old primary destination address becomes active again". This patch is to add this tunable parameter, 'ps_retrans' per netns, sock, asoc and transport. It also allows a user to change ps_retrans per netns by sysctl, and ps_retrans per sock/asoc/transport will be initialized with it. The check will be done in sctp_do_8_2_transport_strike() when this feature is enabled. Note this feature is disabled by initializing 'ps_retrans' per netns as 0xffff by default, and its value can't be less than 'pf_retrans' when changing by sysctl. v3->v4: - add define SCTP_PS_RETRANS_MAX 0xffff, and use it on extra2 of sysctl 'ps_retrans'. - add a new entry for ps_retrans on ip-sysctl.txt. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. 
Miller --- include/net/netns/sctp.h | 6 ++++++ include/net/sctp/constants.h | 2 ++ include/net/sctp/structs.h | 11 ++++++++--- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index 18c3ddae77a3..d8d02e4188d1 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -89,6 +89,12 @@ struct netns_sctp { */ int pf_retrans; + /* Primary.Switchover.Max.Retrans sysctl value + * taken from: + * https://tools.ietf.org/html/rfc7829 + */ + int ps_retrans; + /* * Disable Potentially-Failed feature, the feature is enabled by default * pf_enable - 0 : disable pf diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index e88b77a34cb1..15b4d9aec7ff 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -296,6 +296,8 @@ enum { }; #define SCTP_PF_EXPOSE_MAX SCTP_PF_EXPOSE_ENABLE +#define SCTP_PS_RETRANS_MAX 0xffff + /* These return values describe the success or failure of a number of * routines which form the lower interface to SCTP_outqueue. */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 9a43738774d7..3cc913f328cd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -184,7 +184,8 @@ struct sctp_sock { __u32 flowlabel; __u8 dscp; - int pf_retrans; + __u16 pf_retrans; + __u16 ps_retrans; /* The initial Path MTU to use for new associations. */ __u32 pathmtu; @@ -897,7 +898,9 @@ struct sctp_transport { * and will be initialized from the assocs value. This can be changed * using the SCTP_PEER_ADDR_THLDS socket option */ - int pf_retrans; + __u16 pf_retrans; + /* Used for primary path switchover. */ + __u16 ps_retrans; /* PMTU : The current known path MTU. */ __u32 pathmtu; @@ -1773,7 +1776,9 @@ struct sctp_association { * and will be initialized from the assocs value. 
This can be * changed using the SCTP_PEER_ADDR_THLDS socket option */ - int pf_retrans; + __u16 pf_retrans; + /* Used for primary path switchover. */ + __u16 ps_retrans; /* Maximum number of times the endpoint will retransmit INIT */ __u16 max_init_attempts; -- cgit v1.2.3 From d467ac0a38551a5904878b1f5a2fe20a040c0e11 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:36 +0800 Subject: sctp: add SCTP_PEER_ADDR_THLDS_V2 sockopt Section 7.2 of rfc7829: "Peer Address Thresholds (SCTP_PEER_ADDR_THLDS) Socket Option" extends 'struct sctp_paddrthlds' with 'spt_pathcpthld' added to allow a user to change ps_retrans per sock/asoc/transport, as other 2 paddrthlds: pf_retrans, pathmaxrxt. Note: to not break the user's program, here to support pf_retrans dump and setting by adding a new sockopt SCTP_PEER_ADDR_THLDS_V2, and a new structure sctp_paddrthlds_v2 instead of extending sctp_paddrthlds. Also, when setting ps_retrans, the value is not allowed to be greater than pf_retrans. v1->v2: - use SCTP_PEER_ADDR_THLDS_V2 to set/get pf_retrans instead, as Marcelo and David Laight suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index a190e4a7f546..28ad40d9acba 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -105,6 +105,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_DEFAULT_SNDINFO 34 #define SCTP_AUTH_DEACTIVATE_KEY 35 #define SCTP_REUSE_PORT 36 +#define SCTP_PEER_ADDR_THLDS_V2 37 /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. 
@@ -1087,6 +1088,15 @@ struct sctp_paddrthlds { __u16 spt_pathpfthld; }; +/* Use a new structure with spt_pathcpthld for back compatibility */ +struct sctp_paddrthlds_v2 { + sctp_assoc_t spt_assoc_id; + struct sockaddr_storage spt_address; + __u16 spt_pathmaxrxt; + __u16 spt_pathpfthld; + __u16 spt_pathcpthld; +}; + /* * Socket Option for Getting the Association/Stream-Specific PR-SCTP Status */ -- cgit v1.2.3 From 727b3668b730634228fc65c336c2a7a080e02885 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 8 Nov 2019 17:39:29 +0000 Subject: net: sfp: rework upstream interface The current upstream interface is an all-or-nothing, which is sub-optimal for future changes, as it doesn't allow the upstream driver to prepare for the SFP module becoming available, as it is at boot. Switch to a find-sfp-bus, add-upstream, del-upstream, put-sfp-bus interface structure instead, which allows the upstream driver to prepare for a module being available as soon as add-upstream is called. Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- include/linux/sfp.h | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 355a08a76fd4..c8464de7cff5 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -508,10 +508,11 @@ int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee, u8 *data); void sfp_upstream_start(struct sfp_bus *bus); void sfp_upstream_stop(struct sfp_bus *bus); -struct sfp_bus *sfp_register_upstream_node(struct fwnode_handle *fwnode, - void *upstream, - const struct sfp_upstream_ops *ops); -void sfp_unregister_upstream(struct sfp_bus *bus); +void sfp_bus_put(struct sfp_bus *bus); +struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode); +int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, + const struct sfp_upstream_ops *ops); +void sfp_bus_del_upstream(struct sfp_bus *bus); #else static inline int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, @@ -553,14 +554,22 @@ static inline void sfp_upstream_stop(struct sfp_bus *bus) { } -static inline struct sfp_bus *sfp_register_upstream_node( - struct fwnode_handle *fwnode, void *upstream, - const struct sfp_upstream_ops *ops) +static inline void sfp_bus_put(struct sfp_bus *bus) +{ +} + +static inline struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode) { return NULL; } -static inline void sfp_unregister_upstream(struct sfp_bus *bus) +static int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, + const struct sfp_upstream_ops *ops) +{ + return 0; +} + +static inline void sfp_bus_del_upstream(struct sfp_bus *bus) { } #endif -- cgit v1.2.3 From 6c7295e13ffd5623b02f1adc1442f1d8a3d52424 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Fri, 8 Nov 2019 23:45:20 +0000 Subject: devlink: Add new "enable_roce" generic device param New device parameter to enable/disable handling of RoCE traffic in the device. 
Signed-off-by: Michael Guralnik Acked-by: Jiri Pirko Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/net/devlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 23e4b65ec9df..39fb4d957838 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -400,6 +400,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -434,6 +435,9 @@ enum devlink_param_generic_id { "reset_dev_on_drv_probe" #define DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE DEVLINK_PARAM_TYPE_U8 +#define DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME "enable_roce" +#define DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE DEVLINK_PARAM_TYPE_BOOL + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ -- cgit v1.2.3 From cc9defcbb8fae52810f7795b039223edae51ef95 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Fri, 8 Nov 2019 23:45:24 +0000 Subject: net/mlx5: Handle "enable_roce" devlink param Register "enable_roce" param, default value is RoCE enabled. Current configuration is stored on mlx5_core_dev and exposed to user through the cmode runtime devlink param. Changing configuration requires changing the cmode driverinit devlink param and calling devlink reload. 
Signed-off-by: Michael Guralnik Acked-by: Jiri Pirko Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7b4801e96feb..1884513aac90 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1191,4 +1191,15 @@ enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; +static inline bool mlx5_is_roce_enabled(struct mlx5_core_dev *dev) +{ + struct devlink *devlink = priv_to_devlink(dev); + union devlink_param_value val; + + devlink_param_driverinit_value_get(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + &val); + return val.vbool; +} + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 50ec88120ea16cf8b9aabf8422c364166ce3ee17 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 8 Oct 2019 19:20:39 +0300 Subject: can: mcp251x: get rid of legacy platform data Instead of using legacy platform data, switch to use device properties. For clock frequency we are using well established clock-frequency property. Users, two for now, are also converted here. 
Cc: Daniel Mack Cc: Haojian Zhuang Cc: Robert Jarzmik Cc: Russell King Signed-off-by: Andy Shevchenko Signed-off-by: Marc Kleine-Budde --- include/linux/can/platform/mcp251x.h | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 include/linux/can/platform/mcp251x.h (limited to 'include') diff --git a/include/linux/can/platform/mcp251x.h b/include/linux/can/platform/mcp251x.h deleted file mode 100644 index 9e5ac27fb6c1..000000000000 --- a/include/linux/can/platform/mcp251x.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _CAN_PLATFORM_MCP251X_H -#define _CAN_PLATFORM_MCP251X_H - -/* - * - * CAN bus driver for Microchip 251x CAN Controller with SPI Interface - * - */ - -#include - -/* - * struct mcp251x_platform_data - MCP251X SPI CAN controller platform data - * @oscillator_frequency: - oscillator frequency in Hz - */ - -struct mcp251x_platform_data { - unsigned long oscillator_frequency; -}; - -#endif /* !_CAN_PLATFORM_MCP251X_H */ -- cgit v1.2.3 From 61d2350615c2c42f7af65d9a575f5dbf9738a10e Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 7 Oct 2019 13:36:58 +0200 Subject: can: rx-offload: can_rx_offload_reset(): remove no-op function This patch removes the function can_rx_offload_reset(), as it does nothing. If we ever need this function, add it back again. 
Signed-off-by: Marc Kleine-Budde --- include/linux/can/rx-offload.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index 01219f2902bf..fc75e9a7ad2f 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -44,7 +44,6 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, unsigned int idx, u32 timestamp); int can_rx_offload_queue_tail(struct can_rx_offload *offload, struct sk_buff *skb); -void can_rx_offload_reset(struct can_rx_offload *offload); void can_rx_offload_del(struct can_rx_offload *offload); void can_rx_offload_enable(struct can_rx_offload *offload); -- cgit v1.2.3 From 4e9c9484b085dbba60b299182dd490eaeb84d18a Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Fri, 12 Jul 2019 08:02:38 +0000 Subject: can: rx-offload: Prepare for CAN FD support The skbs for classic CAN and CAN FD frames are allocated with separate functions: alloc_can_skb() and alloc_canfd_skb(). In order to support CAN FD frames via the rx-offload helper, the driver itself has to allocate the skb (depending on whether it received a classic CAN or CAN FD frame), as the rx-offload helper cannot know which kind of CAN frame the driver has received. This patch moves the allocation of the skb into the struct can_rx_offload::mailbox_read callbacks of the flexcan and ti_hecc drivers and adjusts the rx-offload helper accordingly.
Signed-off-by: Joakim Zhang Signed-off-by: Marc Kleine-Budde --- include/linux/can/rx-offload.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index fc75e9a7ad2f..1b78a0cfb615 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -15,9 +15,9 @@ struct can_rx_offload { struct net_device *dev; - unsigned int (*mailbox_read)(struct can_rx_offload *offload, - struct can_frame *cf, - u32 *timestamp, unsigned int mb); + struct sk_buff *(*mailbox_read)(struct can_rx_offload *offload, + unsigned int mb, u32 *timestamp, + bool drop); struct sk_buff_head skb_queue; u32 skb_queue_len_max; -- cgit v1.2.3 From 6c0867022352027409f5a9fee1d3c6923f9e083e Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 12 Nov 2019 11:35:00 +0000 Subject: net: sfp: fix sfp_bus_add_upstream() warning When building with SFP disabled, the stub for sfp_bus_add_upstream() missed "inline". Add it. Fixes: 727b3668b730 ("net: sfp: rework upstream interface") Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/sfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index c8464de7cff5..3b35efd85bb1 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -563,8 +563,8 @@ static inline struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode) return NULL; } -static int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, - const struct sfp_upstream_ops *ops) +static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, + const struct sfp_upstream_ops *ops) { return 0; } -- cgit v1.2.3 From e2cde864a1d3e3626bfc8fa088fbc82b04ce66ed Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 12 Nov 2019 14:07:49 +0200 Subject: devlink: Allow large formatted message of binary output Devlink supports pair output of name and value. When the value is binary, it must be presented in an array. If the length of the binary value exceeds fmsg limitation, break the value into chunks internally. Signed-off-by: Aya Levin Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/devlink.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 7891611868e4..7e72b2e71164 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -967,8 +967,6 @@ int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value); int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value); int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value); int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value); -int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len); int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, bool value); @@ -981,7 +979,7 @@ int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, const char *value); int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u16 value_len); + const void *value, u32 value_len); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, -- cgit v1.2.3 From e0e2b35b790fefbcff5689984a134cdaa4ce051c Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 12 Nov 2019 15:33:11 +0100 Subject: net/sched: actions: remove unused 'order' after commit 4097e9d250fb ("net: sched: don't use tc_action->order during action dump"), 'act->order' is initialized but then it is no longer read, so we can just remove this member of struct tc_action. CC: Ivan Vecera Signed-off-by: Davide Caratti Acked-by: Jiri Pirko Reviewed-by: Ivan Vecera Signed-off-by: David S.
Miller --- include/net/act_api.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 0495bdc034d2..71347a90a9d1 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -23,7 +23,6 @@ struct tc_action_ops; struct tc_action { const struct tc_action_ops *ops; __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ - __u32 order; struct tcf_idrinfo *idrinfo; u32 tcfa_index; -- cgit v1.2.3 From b32d2f341623765f525b1a559aa1758599ed7094 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:51 +0100 Subject: netfilter: nf_flow_table: move conntrack object to struct flow_offload Simplify this code by storing the pointer to conntrack object in the flow_offload structure. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 158514281a75..88c8cd248213 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -72,6 +72,7 @@ struct flow_offload_tuple_rhash { struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; + struct nf_conn *ct; u32 flags; union { /* Your private driver data here. */ -- cgit v1.2.3 From 9f48e9bf253aa292dbf10f173f6f4c02d0349f45 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:52 +0100 Subject: netfilter: nf_flow_table: remove union from flow_offload structure Drivers do not have access to the flow_offload structure, hence remove this union from this flow_offload object as well as the original comment on top of it. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/net/netfilter/nf_flow_table.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 88c8cd248213..7f892d6c1a6d 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -74,10 +74,7 @@ struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; struct nf_conn *ct; u32 flags; - union { - /* Your private driver data here. */ - u32 timeout; - }; + u32 timeout; }; #define NF_FLOW_TIMEOUT (30 * HZ) -- cgit v1.2.3 From 62248df88a406a443b838a3633a7f60a716f999e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:53 +0100 Subject: netfilter: nf_flowtable: remove flow_offload_entry structure Move rcu_head to struct flow_offload, then remove the flow_offload_entry structure definition. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 7f892d6c1a6d..6d33734c8fa1 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -75,6 +75,7 @@ struct flow_offload { struct nf_conn *ct; u32 flags; u32 timeout; + struct rcu_head rcu_head; }; #define NF_FLOW_TIMEOUT (30 * HZ) -- cgit v1.2.3 From f1363e058b84e61d39f9796fa806090ad7a28ebd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:54 +0100 Subject: netfilter: nf_flow_table: detach routing information from flow description This patch adds the infrastructure to support for flow entry types. The initial type is NF_FLOW_OFFLOAD_ROUTE that stores the routing information into the flow entry to define a fastpath for the classic forwarding path. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/net/netfilter/nf_flow_table.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 6d33734c8fa1..f000e8917487 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -70,10 +70,16 @@ struct flow_offload_tuple_rhash { #define FLOW_OFFLOAD_DYING 0x4 #define FLOW_OFFLOAD_TEARDOWN 0x8 +enum flow_offload_type { + NF_FLOW_OFFLOAD_UNSPEC = 0, + NF_FLOW_OFFLOAD_ROUTE, +}; + struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; struct nf_conn *ct; - u32 flags; + u16 flags; + u16 type; u32 timeout; struct rcu_head rcu_head; }; @@ -86,10 +92,12 @@ struct nf_flow_route { } tuple[FLOW_OFFLOAD_DIR_MAX]; }; -struct flow_offload *flow_offload_alloc(struct nf_conn *ct, - struct nf_flow_route *route); +struct flow_offload *flow_offload_alloc(struct nf_conn *ct); void flow_offload_free(struct flow_offload *flow); +int flow_offload_route_init(struct flow_offload *flow, + const struct nf_flow_route *route); + int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple); -- cgit v1.2.3 From 8bb69f3b2918788435cbd5834c66682642c09fba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:55 +0100 Subject: netfilter: nf_tables: add flowtable offload control plane This patch adds the NFTA_FLOWTABLE_FLAGS attribute that allows users to specify the NF_FLOWTABLE_HW_OFFLOAD flag. This patch also adds a new setup interface for the flowtable type to perform the flowtable offload block callback configuration. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/net/netfilter/nf_flow_table.h | 18 ++++++++++++++++++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index f000e8917487..ece09d36c7a6 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -8,6 +8,7 @@ #include #include #include +#include #include struct nf_flowtable; @@ -16,17 +17,27 @@ struct nf_flowtable_type { struct list_head list; int family; int (*init)(struct nf_flowtable *ft); + int (*setup)(struct nf_flowtable *ft, + struct net_device *dev, + enum flow_block_command cmd); void (*free)(struct nf_flowtable *ft); nf_hookfn *hook; struct module *owner; }; +enum nf_flowtable_flags { + NF_FLOWTABLE_HW_OFFLOAD = 0x1, +}; + struct nf_flowtable { struct list_head list; struct rhashtable rhashtable; int priority; const struct nf_flowtable_type *type; struct delayed_work gc_work; + unsigned int flags; + struct flow_block flow_block; + possible_net_t net; }; enum flow_offload_tuple_dir { @@ -131,4 +142,11 @@ unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, #define MODULE_ALIAS_NF_FLOWTABLE(family) \ MODULE_ALIAS("nf-flowtable-" __stringify(family)) +static inline int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd) +{ + return 0; +} + #endif /* _NF_FLOW_TABLE_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 81fed16fe2b2..bb9b049310df 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1518,6 +1518,7 @@ enum nft_object_attributes { * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32) * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32) * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64) + * @NFTA_FLOWTABLE_FLAGS: flags (NLA_U32) */ 
enum nft_flowtable_attributes { NFTA_FLOWTABLE_UNSPEC, @@ -1527,6 +1528,7 @@ enum nft_flowtable_attributes { NFTA_FLOWTABLE_USE, NFTA_FLOWTABLE_HANDLE, NFTA_FLOWTABLE_PAD, + NFTA_FLOWTABLE_FLAGS, __NFTA_FLOWTABLE_MAX }; #define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1) -- cgit v1.2.3 From c29f74e0df7a02b8303bcdce93a7c0132d62577a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:56 +0100 Subject: netfilter: nf_flow_table: hardware offload support This patch adds the dataplane hardware offload to the flowtable infrastructure. Three new flags represent the hardware state of this flow: * FLOW_OFFLOAD_HW: This flow entry resides in the hardware. * FLOW_OFFLOAD_HW_DYING: This flow entry has been scheduled to be removed from hardware. This might be triggered by either packet path (via TCP RST/FIN packet) or via aging. * FLOW_OFFLOAD_HW_DEAD: This flow entry has been already removed from the hardware, the software garbage collector can remove it from the software flowtable. This patch supports: * IPv4 only. * Aging via FLOW_CLS_STATS, no packet and byte counter synchronization at this stage. This patch also adds the action callback that specifies how to convert the flow entry into the flow_rule object that is passed to the driver. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + include/net/netfilter/nf_flow_table.h | 33 +++++++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f857f01234f7..9e6fb8524d91 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -848,6 +848,7 @@ enum tc_setup_type { TC_SETUP_ROOT_QDISC, TC_SETUP_QDISC_GRED, TC_SETUP_QDISC_TAPRIO, + TC_SETUP_FT, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index ece09d36c7a6..eea66de328d3 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -12,6 +12,9 @@ #include struct nf_flowtable; +struct nf_flow_rule; +struct flow_offload; +enum flow_offload_tuple_dir; struct nf_flowtable_type { struct list_head list; @@ -20,6 +23,10 @@ struct nf_flowtable_type { int (*setup)(struct nf_flowtable *ft, struct net_device *dev, enum flow_block_command cmd); + int (*action)(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); void (*free)(struct nf_flowtable *ft); nf_hookfn *hook; struct module *owner; @@ -80,6 +87,9 @@ struct flow_offload_tuple_rhash { #define FLOW_OFFLOAD_DNAT 0x2 #define FLOW_OFFLOAD_DYING 0x4 #define FLOW_OFFLOAD_TEARDOWN 0x8 +#define FLOW_OFFLOAD_HW 0x10 +#define FLOW_OFFLOAD_HW_DYING 0x20 +#define FLOW_OFFLOAD_HW_DEAD 0x40 enum flow_offload_type { NF_FLOW_OFFLOAD_UNSPEC = 0, @@ -142,11 +152,22 @@ unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, #define MODULE_ALIAS_NF_FLOWTABLE(family) \ MODULE_ALIAS("nf-flowtable-" __stringify(family)) -static inline int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, - struct net_device *dev, - enum flow_block_command cmd) -{ - return 0; -} +void nf_flow_offload_add(struct nf_flowtable 
*flowtable, + struct flow_offload *flow); +void nf_flow_offload_del(struct nf_flowtable *flowtable, + struct flow_offload *flow); +void nf_flow_offload_stats(struct nf_flowtable *flowtable, + struct flow_offload *flow); + +void nf_flow_table_offload_flush(struct nf_flowtable *flowtable); +int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd); +int nf_flow_rule_route(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); + +int nf_flow_table_offload_init(void); +void nf_flow_table_offload_exit(void); #endif /* _NF_FLOW_TABLE_H */ -- cgit v1.2.3 From 25da5eb32cd51383f6dca7aad252376f1979c075 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Oct 2019 16:02:50 +0100 Subject: netfilter: nft_meta: offload support for interface index This patch adds support for offloading the NFT_META_IIF selector. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_offload.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index 03cf5856d76f..ea7d1d78b92d 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -45,6 +45,7 @@ struct nft_flow_key { struct flow_dissector_key_ip ip; struct flow_dissector_key_vlan vlan; struct flow_dissector_key_eth_addrs eth_addrs; + struct flow_dissector_key_meta meta; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct nft_flow_match { -- cgit v1.2.3 From 975b992fdd4b38028d7c1dcf38286d6e7991c1b2 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 12 Nov 2019 00:34:29 +0100 Subject: net/mlx5: Add new chain for netfilter flow table offload Netfilter tables (nftables) implements a software datapath that comes after tc ingress datapath. 
The datapath supports offloading such rules via the flow table offload API. This API is currently only used by NFT and it doesn't provide the global priority in regards to tc offload, so we assume offloading such rules must come after tc. It does provide a flow table priority parameter, so we need to provide some supported priority range. For that, split fastpath prio into two, flow table offload and tc offload, with one dedicated priority chain for flow table offload. Next patch will re-use the multi chain API to access this chain by allowing access to this chain by the fdb_sub_namespace. Signed-off-by: Paul Blakey Reviewed-by: Mark Bloch Acked-by: Pablo Neira Ayuso Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 724d276ea133..4e5b84e66822 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -80,7 +80,8 @@ enum mlx5_flow_namespace_type { enum { FDB_BYPASS_PATH, - FDB_FAST_PATH, + FDB_TC_OFFLOAD, + FDB_FT_OFFLOAD, FDB_SLOW_PATH, }; -- cgit v1.2.3 From bd1903b7c4596ba6f7677d0dfefd05ba5876707d Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Wed, 13 Nov 2019 23:04:49 +0800 Subject: net: openvswitch: add hash info to upcall When using the kernel datapath, the upcall doesn't include the related skb hash info. That will introduce some problems, because the hash of skb is important in kernel stack. For example, VXLAN module uses it to select UDP src port. The tx queue selection may also use the hash in stack. Hash is computed in different ways. Hash is random for a TCP socket, and hash may be computed in hardware, or software stack. Recalculating the hash is not easy. Hash of TCP socket is computed: tcp_v4_connect -> sk_set_txhash (is random) __tcp_transmit_skb -> skb_set_hash_from_sk There will be one upcall, without information of skb hash, to ovs-vswitchd, for the first packet of a TCP session. 
The rest packets will be processed in Open vSwitch modules, hash kept. If this tcp session is forward to VXLAN module, then the UDP src port of first tcp packet is different from rest packets. TCP packets may come from the host or dockers, to Open vSwitch. To fix it, we store the hash info to upcall, and restore hash when packets sent back. +---------------+ +-------------------------+ | Docker/VMs | | ovs-vswitchd | +----+----------+ +-+--------------------+--+ | ^ | | | | | | upcall v restore packet hash (not recalculate) | +-+--------------------+--+ | tap netdev | | vxlan module +---------------> +--> Open vSwitch ko +--> or internal type | | +-------------------------+ Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 1887a451c388..a87b44cd5590 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -173,6 +173,7 @@ enum ovs_packet_cmd { * @OVS_PACKET_ATTR_LEN: Packet size before truncation. * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment * size. + * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb). * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands. @@ -190,7 +191,8 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe, error logging should be suppressed. */ OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ - OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */ + OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */ + OVS_PACKET_ATTR_HASH, /* Packet hash. 
*/ __OVS_PACKET_ATTR_MAX }; -- cgit v1.2.3 From 4d66c56f7efe122d09d06cd3ebfa52a43d51a9cb Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 13 Nov 2019 10:42:25 -0600 Subject: dt-bindings: net: dp83869: Add TI dp83869 phy Add dt bindings for the TI dp83869 Gigabit ethernet phy device. Signed-off-by: Dan Murphy CC: Rob Herring Signed-off-by: David S. Miller --- include/dt-bindings/net/ti-dp83869.h | 42 ++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 include/dt-bindings/net/ti-dp83869.h (limited to 'include') diff --git a/include/dt-bindings/net/ti-dp83869.h b/include/dt-bindings/net/ti-dp83869.h new file mode 100644 index 000000000000..218b1a64e975 --- /dev/null +++ b/include/dt-bindings/net/ti-dp83869.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Device Tree constants for the Texas Instruments DP83869 PHY + * + * Author: Dan Murphy + * + * Copyright: (C) 2019 Texas Instruments, Inc. + */ + +#ifndef _DT_BINDINGS_TI_DP83869_H +#define _DT_BINDINGS_TI_DP83869_H + +/* PHY CTRL bits */ +#define DP83869_PHYCR_FIFO_DEPTH_3_B_NIB 0x00 +#define DP83869_PHYCR_FIFO_DEPTH_4_B_NIB 0x01 +#define DP83869_PHYCR_FIFO_DEPTH_6_B_NIB 0x02 +#define DP83869_PHYCR_FIFO_DEPTH_8_B_NIB 0x03 + +/* IO_MUX_CFG - Clock output selection */ +#define DP83869_CLK_O_SEL_CHN_A_RCLK 0x0 +#define DP83869_CLK_O_SEL_CHN_B_RCLK 0x1 +#define DP83869_CLK_O_SEL_CHN_C_RCLK 0x2 +#define DP83869_CLK_O_SEL_CHN_D_RCLK 0x3 +#define DP83869_CLK_O_SEL_CHN_A_RCLK_DIV5 0x4 +#define DP83869_CLK_O_SEL_CHN_B_RCLK_DIV5 0x5 +#define DP83869_CLK_O_SEL_CHN_C_RCLK_DIV5 0x6 +#define DP83869_CLK_O_SEL_CHN_D_RCLK_DIV5 0x7 +#define DP83869_CLK_O_SEL_CHN_A_TCLK 0x8 +#define DP83869_CLK_O_SEL_CHN_B_TCLK 0x9 +#define DP83869_CLK_O_SEL_CHN_C_TCLK 0xa +#define DP83869_CLK_O_SEL_CHN_D_TCLK 0xb +#define DP83869_CLK_O_SEL_REF_CLK 0xc + +#define DP83869_RGMII_COPPER_ETHERNET 0x00 +#define DP83869_RGMII_1000_BASE 0x01 +#define DP83869_RGMII_100_BASE 0x02 +#define 
DP83869_RGMII_SGMII_BRIDGE 0x03 +#define DP83869_1000M_MEDIA_CONVERT 0x04 +#define DP83869_100M_MEDIA_CONVERT 0x05 +#define DP83869_SGMII_COPPER_ETHERNET 0x06 + +#endif -- cgit v1.2.3 From db205c766862edae48d64e69e2f2502e2a3e9135 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:37 +0100 Subject: vsock: remove vm_sockets_get_local_cid() vm_sockets_get_local_cid() is only used in virtio_transport_common.c. We can replace it by calling the virtio_transport_get_ops() and using the get_local_cid() callback registered by the transport. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/linux/vm_sockets.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/vm_sockets.h b/include/linux/vm_sockets.h index 33f1a2ecd905..7dd899ccb920 100644 --- a/include/linux/vm_sockets.h +++ b/include/linux/vm_sockets.h @@ -10,6 +10,4 @@ #include -int vm_sockets_get_local_cid(void); - #endif /* _VM_SOCKETS_H */ -- cgit v1.2.3 From 3603a2e991a82e5094c3107a792859b08342aed3 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:38 +0100 Subject: vsock: remove include/linux/vm_sockets.h file This header file now only includes the "uapi/linux/vm_sockets.h". We can include it directly when needed. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. 
Miller --- include/linux/vm_sockets.h | 13 ------------- include/net/af_vsock.h | 2 +- include/net/vsock_addr.h | 2 +- 3 files changed, 2 insertions(+), 15 deletions(-) delete mode 100644 include/linux/vm_sockets.h (limited to 'include') diff --git a/include/linux/vm_sockets.h b/include/linux/vm_sockets.h deleted file mode 100644 index 7dd899ccb920..000000000000 --- a/include/linux/vm_sockets.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * VMware vSockets Driver - * - * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. - */ - -#ifndef _VM_SOCKETS_H -#define _VM_SOCKETS_H - -#include - -#endif /* _VM_SOCKETS_H */ diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 80ea0f93d3f7..c660402b10f2 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -10,7 +10,7 @@ #include #include -#include +#include #include "vsock_addr.h" diff --git a/include/net/vsock_addr.h b/include/net/vsock_addr.h index 57d2db5c4bdf..cf8cc140d68d 100644 --- a/include/net/vsock_addr.h +++ b/include/net/vsock_addr.h @@ -8,7 +8,7 @@ #ifndef _VSOCK_ADDR_H_ #define _VSOCK_ADDR_H_ -#include +#include void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port); int vsock_addr_validate(const struct sockaddr_vm *addr); -- cgit v1.2.3 From fe502c4a38d97e5f8b9d5602af1f07f5abc529d2 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:39 +0100 Subject: vsock: add 'transport' member in the struct vsock_sock As a preparation to support multiple transports, this patch adds the 'transport' member at the 'struct vsock_sock'. This new field is initialized during the creation in the __vsock_create() function. This patch also renames the global 'transport' pointer to 'transport_single', since for now we're only supporting a single transport registered at run-time. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. 
Miller --- include/net/af_vsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index c660402b10f2..a5e1e134261d 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -27,6 +27,7 @@ extern spinlock_t vsock_table_lock; struct vsock_sock { /* sk must be the first member. */ struct sock sk; + const struct vsock_transport *transport; struct sockaddr_vm local_addr; struct sockaddr_vm remote_addr; /* Links for the global tables of bound and connected sockets. */ -- cgit v1.2.3 From 4c7246dc45e2706770d5233f7ce1597a07e069ba Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:40 +0100 Subject: vsock/virtio: add transport parameter to the virtio_transport_reset_no_sock() We are going to add 'struct vsock_sock *' parameter to virtio_transport_get_ops(). In some cases, like in the virtio_transport_reset_no_sock(), we don't have any socket assigned to the packet received, so we can't use the virtio_transport_get_ops(). In order to allow virtio_transport_reset_no_sock() to use the '.send_pkt' callback from the 'vhost_transport' or 'virtio_transport', we add the 'struct virtio_transport *' to it and to its caller: virtio_transport_recv_pkt(). We moved the 'vhost_transport' and 'virtio_transport' definition, to pass their address to the virtio_transport_recv_pkt(). Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Signed-off-by: David S. 
Miller --- include/linux/virtio_vsock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 07875ccc7bb5..b139f76060a6 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -150,7 +150,8 @@ virtio_transport_dgram_enqueue(struct vsock_sock *vsk, void virtio_transport_destruct(struct vsock_sock *vsk); -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_recv_pkt(struct virtio_transport *t, + struct virtio_vsock_pkt *pkt); void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt); u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted); -- cgit v1.2.3 From daabfbca34ecfa936d3bf5219167c4c5e67db150 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:41 +0100 Subject: vsock: add 'struct vsock_sock *' param to vsock_core_get_transport() Since now the 'struct vsock_sock' object contains a pointer to the transport, this patch adds a parameter to the vsock_core_get_transport() to return the right transport assigned to the socket. This patch modifies also the virtio_transport_get_ops(), that uses the vsock_core_get_transport(), adding the 'struct vsock_sock *' parameter. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. 
Miller --- include/net/af_vsock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index a5e1e134261d..2ca67d048de4 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -166,7 +166,7 @@ static inline int vsock_core_init(const struct vsock_transport *t) void vsock_core_exit(void); /* The transport may downcast this to access transport-specific functions */ -const struct vsock_transport *vsock_core_get_transport(void); +const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk); /**** UTILS ****/ -- cgit v1.2.3 From b9f2b0ffde0c9b666b2b1672eb468b8f805a9b97 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:42 +0100 Subject: vsock: handle buffer_size sockopts in the core virtio_transport and vmci_transport handle the buffer_size sockopts in a very similar way. In order to support multiple transports, this patch moves this handling in the core to allow the user to change the options also if the socket is not yet assigned to any transport. This patch also adds the '.notify_buffer_size' callback in the 'struct virtio_transport' in order to inform the transport, when the buffer_size is changed by the user. It is also useful to limit the 'buffer_size' requested (e.g. virtio transports). Acked-by: Dexuan Cui Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. 
Miller --- include/linux/virtio_vsock.h | 15 +-------------- include/net/af_vsock.h | 15 +++++++-------- 2 files changed, 8 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index b139f76060a6..71c81e0dc8f2 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -7,9 +7,6 @@ #include #include -#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 -#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) @@ -25,11 +22,6 @@ enum { struct virtio_vsock_sock { struct vsock_sock *vsk; - /* Protected by lock_sock(sk_vsock(trans->vsk)) */ - u32 buf_size; - u32 buf_size_min; - u32 buf_size_max; - spinlock_t tx_lock; spinlock_t rx_lock; @@ -92,12 +84,6 @@ s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); int virtio_transport_do_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk); -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); int virtio_transport_notify_poll_in(struct vsock_sock *vsk, size_t target, @@ -124,6 +110,7 @@ int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, struct vsock_transport_send_notify_data *data); int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written, struct vsock_transport_send_notify_data *data); +void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val); u64 virtio_transport_stream_rcvhiwat(struct vsock_sock 
*vsk); bool virtio_transport_stream_is_active(struct vsock_sock *vsk); diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 2ca67d048de4..4b5d16840fd4 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -65,6 +65,11 @@ struct vsock_sock { bool sent_request; bool ignore_connecting_rst; + /* Protected by lock_sock(sk) */ + u64 buffer_size; + u64 buffer_min_size; + u64 buffer_max_size; + /* Private to transport. */ void *trans; }; @@ -140,18 +145,12 @@ struct vsock_transport { struct vsock_transport_send_notify_data *); int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t, struct vsock_transport_send_notify_data *); + /* sk_lock held by the caller */ + void (*notify_buffer_size)(struct vsock_sock *, u64 *); /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); - /* Buffer sizes. */ - void (*set_buffer_size)(struct vsock_sock *, u64); - void (*set_min_buffer_size)(struct vsock_sock *, u64); - void (*set_max_buffer_size)(struct vsock_sock *, u64); - u64 (*get_buffer_size)(struct vsock_sock *); - u64 (*get_min_buffer_size)(struct vsock_sock *); - u64 (*get_max_buffer_size)(struct vsock_sock *); - /* Addressing. */ u32 (*get_local_cid)(void); }; -- cgit v1.2.3 From b9ca2f5ff7784d46285a8f1b14419ac4645096f7 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:43 +0100 Subject: vsock: add vsock_create_connected() called by transports All transports call __vsock_create() with the same parameters, most of them depending on the parent socket. In order to simplify the VSOCK core APIs exposed to the transports, this patch adds the vsock_create_connected() callable from transports to create a new socket when a connection request is received. We also unexported the __vsock_create(). Suggested-by: Stefan Hajnoczi Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. 
Miller --- include/net/af_vsock.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 4b5d16840fd4..fa1570dc9f5c 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -76,10 +76,7 @@ struct vsock_sock { s64 vsock_stream_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_space(struct vsock_sock *vsk); -struct sock *__vsock_create(struct net *net, - struct socket *sock, - struct sock *parent, - gfp_t priority, unsigned short type, int kern); +struct sock *vsock_create_connected(struct sock *parent); /**** TRANSPORT ****/ -- cgit v1.2.3 From c0cfa2d8a788fcf45df5bf4070ab2474c88d543a Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:46 +0100 Subject: vsock: add multi-transports support This patch adds the support of multiple transports in the VSOCK core. With the multi-transports support, we can use vsock with nested VMs (using also different hypervisors) loading both guest->host and host->guest transports at the same time. Major changes: - vsock core module can be loaded regardless of the transports - vsock_core_init() and vsock_core_exit() are renamed to vsock_core_register() and vsock_core_unregister() - vsock_core_register() has a feature parameter (H2G, G2H, DGRAM) to identify which directions the transport can handle and whether it supports DGRAM (only vmci) - each stream socket is assigned to a transport when the remote CID is set (during the connect() or when we receive a connection request on a listener socket). 
The remote CID is used to decide which transport to use: - remote CID <= VMADDR_CID_HOST will use guest->host transport; - remote CID == local_cid (guest->host transport) will use guest->host transport for loopback (host->guest transports don't support loopback); - remote CID > VMADDR_CID_HOST will use host->guest transport; - listener sockets are not bound to any transports since no transport operations are done on it. In this way we can create a listener socket, also if the transports are not loaded or with VMADDR_CID_ANY to listen on all transports. - DGRAM sockets are handled as before, since only the vmci_transport provides this feature. Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/net/af_vsock.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index fa1570dc9f5c..cf5c3691251b 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -91,6 +91,14 @@ struct vsock_transport_send_notify_data { u64 data2; /* Transport-defined. */ }; +/* Transport features flags */ +/* Transport provides host->guest communication */ +#define VSOCK_TRANSPORT_F_H2G 0x00000001 +/* Transport provides guest->host communication */ +#define VSOCK_TRANSPORT_F_G2H 0x00000002 +/* Transport provides DGRAM communication */ +#define VSOCK_TRANSPORT_F_DGRAM 0x00000004 + struct vsock_transport { /* Initialize/tear-down socket. 
*/ int (*init)(struct vsock_sock *, struct vsock_sock *); @@ -154,12 +162,8 @@ struct vsock_transport { /**** CORE ****/ -int __vsock_core_init(const struct vsock_transport *t, struct module *owner); -static inline int vsock_core_init(const struct vsock_transport *t) -{ - return __vsock_core_init(t, THIS_MODULE); -} -void vsock_core_exit(void); +int vsock_core_register(const struct vsock_transport *t, int features); +void vsock_core_unregister(const struct vsock_transport *t); /* The transport may downcast this to access transport-specific functions */ const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk); @@ -190,6 +194,8 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_remove_sock(struct vsock_sock *vsk); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); +int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); +bool vsock_find_cid(unsigned int cid); /**** TAP ****/ -- cgit v1.2.3 From b1bba80a4376aef34de2b57bfb8834bd095703ed Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:47 +0100 Subject: vsock/vmci: register vmci_transport only when VMCI guest/host are active To allow other transports to be loaded with vmci_transport, we register the vmci_transport as G2H or H2G only when a VMCI guest or host is active. To do that, this patch adds a callback registered in the vmci driver that will be called when the host or guest becomes active. This callback will register the vmci_transport in the VSOCK core. Cc: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. 
Miller --- include/linux/vmw_vmci_api.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/vmw_vmci_api.h b/include/linux/vmw_vmci_api.h index acd9fafe4fc6..f28907345c80 100644 --- a/include/linux/vmw_vmci_api.h +++ b/include/linux/vmw_vmci_api.h @@ -19,6 +19,7 @@ struct msghdr; typedef void (vmci_device_shutdown_fn) (void *device_registration, void *user_data); +typedef void (*vmci_vsock_cb) (bool is_host); int vmci_datagram_create_handle(u32 resource_id, u32 flags, vmci_datagram_recv_cb recv_cb, @@ -37,6 +38,7 @@ int vmci_doorbell_destroy(struct vmci_handle handle); int vmci_doorbell_notify(struct vmci_handle handle, u32 priv_flags); u32 vmci_get_context_id(void); bool vmci_is_context_owner(u32 context_id, kuid_t uid); +int vmci_register_vsock_callback(vmci_vsock_cb callback); int vmci_event_subscribe(u32 event, vmci_event_cb callback, void *callback_data, -- cgit v1.2.3 From 6a2c0962105ae8ceba182c4f616e0e41d7755591 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:48 +0100 Subject: vsock: prevent transport modules unloading This patch adds 'module' member in the 'struct vsock_transport' in order to get/put the transport module. This prevents the module unloading while sockets are assigned to it. We increase the module refcnt when a socket is assigned to a transport, and we decrease the module refcnt when the socket is destructed. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/net/af_vsock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index cf5c3691251b..4206dc6d813f 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -100,6 +100,8 @@ struct vsock_transport_send_notify_data { #define VSOCK_TRANSPORT_F_DGRAM 0x00000004 struct vsock_transport { + struct module *module; + /* Initialize/tear-down socket. 
*/ int (*init)(struct vsock_sock *, struct vsock_sock *); void (*destruct)(struct vsock_sock *); -- cgit v1.2.3 From 42bfba9eaa33dd4af0b50b87508062a41ec26653 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 14 Nov 2019 13:02:41 +0100 Subject: net/smc: immediate termination for SMCD link groups SMCD link group termination is called when peer signals its shutdown of its corresponding link group. For regular shutdowns no connections exist anymore. For abnormal shutdowns connections must be killed and their DMBs must be unregistered immediately. That means the SMCR method to delay the link group freeing several seconds does not fit. This patch adds immediate termination of a link group and its SMCD connections and makes sure all SMCD link group related cleanup steps are finished. Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- include/net/smc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index 05174ae4f325..7c2082341bb3 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -37,6 +37,8 @@ struct smcd_dmb { #define ISM_EVENT_GID 1 #define ISM_EVENT_SWR 2 +#define ISM_ERROR 0xFFFF + struct smcd_event { u32 type; u32 code; -- cgit v1.2.3 From 5edd6b9cb8d7c6c346c93c52a53735591127e879 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 14 Nov 2019 13:02:43 +0100 Subject: net/smc: introduce bookkeeping of SMCD link groups If the ism module is unloaded return control from exit routine only, if all link groups are freed. If an IB device is thrown away return control from device removal only, if all link groups belonging to this device are freed. A counter for the total number of SMCD link groups per ISM device is introduced. ism module unloading continues only if the total number of SMCD link groups for all ISM devices is zero. ISM device removal continues only if the total number of SMCD link groups per ISM device has decreased to zero. 
Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- include/net/smc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index 7c2082341bb3..646feb4bc75f 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -79,6 +79,8 @@ struct smcd_dev { bool pnetid_by_user; struct list_head lgr_list; spinlock_t lgr_lock; + atomic_t lgr_cnt; + wait_queue_head_t lgrs_deleted; u8 going_away : 1; }; -- cgit v1.2.3 From 5e2563650232a4d998a60b10d3679f65dd4c02fb Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 14 Nov 2019 17:03:27 +0200 Subject: net: mscc: ocelot: publish structure definitions to include/soc/mscc/ocelot.h We will be registering another switch driver based on ocelot, which lives under drivers/net/dsa. Make sure the Felix DSA front-end has the necessary abstractions to implement a new Ocelot driver instantiation. This includes the function prototypes for implementing DSA callbacks. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/soc/mscc/ocelot.h | 539 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 539 insertions(+) create mode 100644 include/soc/mscc/ocelot.h (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h new file mode 100644 index 000000000000..a836afe8f68e --- /dev/null +++ b/include/soc/mscc/ocelot.h @@ -0,0 +1,539 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* Copyright (c) 2017 Microsemi Corporation + */ + +#ifndef _SOC_MSCC_OCELOT_H +#define _SOC_MSCC_OCELOT_H + +#include +#include +#include +#include +#include + +#define IFH_INJ_BYPASS BIT(31) +#define IFH_INJ_POP_CNT_DISABLE (3 << 28) + +#define IFH_TAG_TYPE_C 0 +#define IFH_TAG_TYPE_S 1 + +#define IFH_REW_OP_NOOP 0x0 +#define IFH_REW_OP_DSCP 0x1 +#define IFH_REW_OP_ONE_STEP_PTP 0x2 +#define IFH_REW_OP_TWO_STEP_PTP 0x3 +#define IFH_REW_OP_ORIGIN_PTP 0x5 + +#define OCELOT_TAG_LEN 16 +#define OCELOT_SHORT_PREFIX_LEN 4 +#define OCELOT_LONG_PREFIX_LEN 16 + +#define OCELOT_SPEED_2500 0 +#define OCELOT_SPEED_1000 1 +#define OCELOT_SPEED_100 2 +#define OCELOT_SPEED_10 3 + +#define TARGET_OFFSET 24 +#define REG_MASK GENMASK(TARGET_OFFSET - 1, 0) +#define REG(reg, offset) [reg & REG_MASK] = offset + +#define REG_RESERVED_ADDR 0xffffffff +#define REG_RESERVED(reg) REG(reg, REG_RESERVED_ADDR) + +enum ocelot_target { + ANA = 1, + QS, + QSYS, + REW, + SYS, + S2, + HSIO, + PTP, + GCB, + TARGET_MAX, +}; + +enum ocelot_reg { + ANA_ADVLEARN = ANA << TARGET_OFFSET, + ANA_VLANMASK, + ANA_PORT_B_DOMAIN, + ANA_ANAGEFIL, + ANA_ANEVENTS, + ANA_STORMLIMIT_BURST, + ANA_STORMLIMIT_CFG, + ANA_ISOLATED_PORTS, + ANA_COMMUNITY_PORTS, + ANA_AUTOAGE, + ANA_MACTOPTIONS, + ANA_LEARNDISC, + ANA_AGENCTRL, + ANA_MIRRORPORTS, + ANA_EMIRRORPORTS, + ANA_FLOODING, + ANA_FLOODING_IPMC, + ANA_SFLOW_CFG, + ANA_PORT_MODE, + ANA_CUT_THRU_CFG, + ANA_PGID_PGID, + ANA_TABLES_ANMOVED, + ANA_TABLES_MACHDATA, + ANA_TABLES_MACLDATA, + ANA_TABLES_STREAMDATA, + ANA_TABLES_MACACCESS, + 
ANA_TABLES_MACTINDX, + ANA_TABLES_VLANACCESS, + ANA_TABLES_VLANTIDX, + ANA_TABLES_ISDXACCESS, + ANA_TABLES_ISDXTIDX, + ANA_TABLES_ENTRYLIM, + ANA_TABLES_PTP_ID_HIGH, + ANA_TABLES_PTP_ID_LOW, + ANA_TABLES_STREAMACCESS, + ANA_TABLES_STREAMTIDX, + ANA_TABLES_SEQ_HISTORY, + ANA_TABLES_SEQ_MASK, + ANA_TABLES_SFID_MASK, + ANA_TABLES_SFIDACCESS, + ANA_TABLES_SFIDTIDX, + ANA_MSTI_STATE, + ANA_OAM_UPM_LM_CNT, + ANA_SG_ACCESS_CTRL, + ANA_SG_CONFIG_REG_1, + ANA_SG_CONFIG_REG_2, + ANA_SG_CONFIG_REG_3, + ANA_SG_CONFIG_REG_4, + ANA_SG_CONFIG_REG_5, + ANA_SG_GCL_GS_CONFIG, + ANA_SG_GCL_TI_CONFIG, + ANA_SG_STATUS_REG_1, + ANA_SG_STATUS_REG_2, + ANA_SG_STATUS_REG_3, + ANA_PORT_VLAN_CFG, + ANA_PORT_DROP_CFG, + ANA_PORT_QOS_CFG, + ANA_PORT_VCAP_CFG, + ANA_PORT_VCAP_S1_KEY_CFG, + ANA_PORT_VCAP_S2_CFG, + ANA_PORT_PCP_DEI_MAP, + ANA_PORT_CPU_FWD_CFG, + ANA_PORT_CPU_FWD_BPDU_CFG, + ANA_PORT_CPU_FWD_GARP_CFG, + ANA_PORT_CPU_FWD_CCM_CFG, + ANA_PORT_PORT_CFG, + ANA_PORT_POL_CFG, + ANA_PORT_PTP_CFG, + ANA_PORT_PTP_DLY1_CFG, + ANA_PORT_PTP_DLY2_CFG, + ANA_PORT_SFID_CFG, + ANA_PFC_PFC_CFG, + ANA_PFC_PFC_TIMER, + ANA_IPT_OAM_MEP_CFG, + ANA_IPT_IPT, + ANA_PPT_PPT, + ANA_FID_MAP_FID_MAP, + ANA_AGGR_CFG, + ANA_CPUQ_CFG, + ANA_CPUQ_CFG2, + ANA_CPUQ_8021_CFG, + ANA_DSCP_CFG, + ANA_DSCP_REWR_CFG, + ANA_VCAP_RNG_TYPE_CFG, + ANA_VCAP_RNG_VAL_CFG, + ANA_VRAP_CFG, + ANA_VRAP_HDR_DATA, + ANA_VRAP_HDR_MASK, + ANA_DISCARD_CFG, + ANA_FID_CFG, + ANA_POL_PIR_CFG, + ANA_POL_CIR_CFG, + ANA_POL_MODE_CFG, + ANA_POL_PIR_STATE, + ANA_POL_CIR_STATE, + ANA_POL_STATE, + ANA_POL_FLOWC, + ANA_POL_HYST, + ANA_POL_MISC_CFG, + QS_XTR_GRP_CFG = QS << TARGET_OFFSET, + QS_XTR_RD, + QS_XTR_FRM_PRUNING, + QS_XTR_FLUSH, + QS_XTR_DATA_PRESENT, + QS_XTR_CFG, + QS_INJ_GRP_CFG, + QS_INJ_WR, + QS_INJ_CTRL, + QS_INJ_STATUS, + QS_INJ_ERR, + QS_INH_DBG, + QSYS_PORT_MODE = QSYS << TARGET_OFFSET, + QSYS_SWITCH_PORT_MODE, + QSYS_STAT_CNT_CFG, + QSYS_EEE_CFG, + QSYS_EEE_THRES, + QSYS_IGR_NO_SHARING, + QSYS_EGR_NO_SHARING, + QSYS_SW_STATUS, + 
QSYS_EXT_CPU_CFG, + QSYS_PAD_CFG, + QSYS_CPU_GROUP_MAP, + QSYS_QMAP, + QSYS_ISDX_SGRP, + QSYS_TIMED_FRAME_ENTRY, + QSYS_TFRM_MISC, + QSYS_TFRM_PORT_DLY, + QSYS_TFRM_TIMER_CFG_1, + QSYS_TFRM_TIMER_CFG_2, + QSYS_TFRM_TIMER_CFG_3, + QSYS_TFRM_TIMER_CFG_4, + QSYS_TFRM_TIMER_CFG_5, + QSYS_TFRM_TIMER_CFG_6, + QSYS_TFRM_TIMER_CFG_7, + QSYS_TFRM_TIMER_CFG_8, + QSYS_RED_PROFILE, + QSYS_RES_QOS_MODE, + QSYS_RES_CFG, + QSYS_RES_STAT, + QSYS_EGR_DROP_MODE, + QSYS_EQ_CTRL, + QSYS_EVENTS_CORE, + QSYS_QMAXSDU_CFG_0, + QSYS_QMAXSDU_CFG_1, + QSYS_QMAXSDU_CFG_2, + QSYS_QMAXSDU_CFG_3, + QSYS_QMAXSDU_CFG_4, + QSYS_QMAXSDU_CFG_5, + QSYS_QMAXSDU_CFG_6, + QSYS_QMAXSDU_CFG_7, + QSYS_PREEMPTION_CFG, + QSYS_CIR_CFG, + QSYS_EIR_CFG, + QSYS_SE_CFG, + QSYS_SE_DWRR_CFG, + QSYS_SE_CONNECT, + QSYS_SE_DLB_SENSE, + QSYS_CIR_STATE, + QSYS_EIR_STATE, + QSYS_SE_STATE, + QSYS_HSCH_MISC_CFG, + QSYS_TAG_CONFIG, + QSYS_TAS_PARAM_CFG_CTRL, + QSYS_PORT_MAX_SDU, + QSYS_PARAM_CFG_REG_1, + QSYS_PARAM_CFG_REG_2, + QSYS_PARAM_CFG_REG_3, + QSYS_PARAM_CFG_REG_4, + QSYS_PARAM_CFG_REG_5, + QSYS_GCL_CFG_REG_1, + QSYS_GCL_CFG_REG_2, + QSYS_PARAM_STATUS_REG_1, + QSYS_PARAM_STATUS_REG_2, + QSYS_PARAM_STATUS_REG_3, + QSYS_PARAM_STATUS_REG_4, + QSYS_PARAM_STATUS_REG_5, + QSYS_PARAM_STATUS_REG_6, + QSYS_PARAM_STATUS_REG_7, + QSYS_PARAM_STATUS_REG_8, + QSYS_PARAM_STATUS_REG_9, + QSYS_GCL_STATUS_REG_1, + QSYS_GCL_STATUS_REG_2, + REW_PORT_VLAN_CFG = REW << TARGET_OFFSET, + REW_TAG_CFG, + REW_PORT_CFG, + REW_DSCP_CFG, + REW_PCP_DEI_QOS_MAP_CFG, + REW_PTP_CFG, + REW_PTP_DLY1_CFG, + REW_RED_TAG_CFG, + REW_DSCP_REMAP_DP1_CFG, + REW_DSCP_REMAP_CFG, + REW_STAT_CFG, + REW_REW_STICKY, + REW_PPT, + SYS_COUNT_RX_OCTETS = SYS << TARGET_OFFSET, + SYS_COUNT_RX_UNICAST, + SYS_COUNT_RX_MULTICAST, + SYS_COUNT_RX_BROADCAST, + SYS_COUNT_RX_SHORTS, + SYS_COUNT_RX_FRAGMENTS, + SYS_COUNT_RX_JABBERS, + SYS_COUNT_RX_CRC_ALIGN_ERRS, + SYS_COUNT_RX_SYM_ERRS, + SYS_COUNT_RX_64, + SYS_COUNT_RX_65_127, + SYS_COUNT_RX_128_255, + SYS_COUNT_RX_256_1023, + 
SYS_COUNT_RX_1024_1526, + SYS_COUNT_RX_1527_MAX, + SYS_COUNT_RX_PAUSE, + SYS_COUNT_RX_CONTROL, + SYS_COUNT_RX_LONGS, + SYS_COUNT_RX_CLASSIFIED_DROPS, + SYS_COUNT_TX_OCTETS, + SYS_COUNT_TX_UNICAST, + SYS_COUNT_TX_MULTICAST, + SYS_COUNT_TX_BROADCAST, + SYS_COUNT_TX_COLLISION, + SYS_COUNT_TX_DROPS, + SYS_COUNT_TX_PAUSE, + SYS_COUNT_TX_64, + SYS_COUNT_TX_65_127, + SYS_COUNT_TX_128_511, + SYS_COUNT_TX_512_1023, + SYS_COUNT_TX_1024_1526, + SYS_COUNT_TX_1527_MAX, + SYS_COUNT_TX_AGING, + SYS_RESET_CFG, + SYS_SR_ETYPE_CFG, + SYS_VLAN_ETYPE_CFG, + SYS_PORT_MODE, + SYS_FRONT_PORT_MODE, + SYS_FRM_AGING, + SYS_STAT_CFG, + SYS_SW_STATUS, + SYS_MISC_CFG, + SYS_REW_MAC_HIGH_CFG, + SYS_REW_MAC_LOW_CFG, + SYS_TIMESTAMP_OFFSET, + SYS_CMID, + SYS_PAUSE_CFG, + SYS_PAUSE_TOT_CFG, + SYS_ATOP, + SYS_ATOP_TOT_CFG, + SYS_MAC_FC_CFG, + SYS_MMGT, + SYS_MMGT_FAST, + SYS_EVENTS_DIF, + SYS_EVENTS_CORE, + SYS_CNT, + SYS_PTP_STATUS, + SYS_PTP_TXSTAMP, + SYS_PTP_NXT, + SYS_PTP_CFG, + SYS_RAM_INIT, + SYS_CM_ADDR, + SYS_CM_DATA_WR, + SYS_CM_DATA_RD, + SYS_CM_OP, + SYS_CM_DATA, + S2_CORE_UPDATE_CTRL = S2 << TARGET_OFFSET, + S2_CORE_MV_CFG, + S2_CACHE_ENTRY_DAT, + S2_CACHE_MASK_DAT, + S2_CACHE_ACTION_DAT, + S2_CACHE_CNT_DAT, + S2_CACHE_TG_DAT, + PTP_PIN_CFG = PTP << TARGET_OFFSET, + PTP_PIN_TOD_SEC_MSB, + PTP_PIN_TOD_SEC_LSB, + PTP_PIN_TOD_NSEC, + PTP_CFG_MISC, + PTP_CLK_CFG_ADJ_CFG, + PTP_CLK_CFG_ADJ_FREQ, + GCB_SOFT_RST = GCB << TARGET_OFFSET, +}; + +enum ocelot_regfield { + ANA_ADVLEARN_VLAN_CHK, + ANA_ADVLEARN_LEARN_MIRROR, + ANA_ANEVENTS_FLOOD_DISCARD, + ANA_ANEVENTS_MSTI_DROP, + ANA_ANEVENTS_ACLKILL, + ANA_ANEVENTS_ACLUSED, + ANA_ANEVENTS_AUTOAGE, + ANA_ANEVENTS_VS2TTL1, + ANA_ANEVENTS_STORM_DROP, + ANA_ANEVENTS_LEARN_DROP, + ANA_ANEVENTS_AGED_ENTRY, + ANA_ANEVENTS_CPU_LEARN_FAILED, + ANA_ANEVENTS_AUTO_LEARN_FAILED, + ANA_ANEVENTS_LEARN_REMOVE, + ANA_ANEVENTS_AUTO_LEARNED, + ANA_ANEVENTS_AUTO_MOVED, + ANA_ANEVENTS_DROPPED, + ANA_ANEVENTS_CLASSIFIED_DROP, + ANA_ANEVENTS_CLASSIFIED_COPY, + 
ANA_ANEVENTS_VLAN_DISCARD, + ANA_ANEVENTS_FWD_DISCARD, + ANA_ANEVENTS_MULTICAST_FLOOD, + ANA_ANEVENTS_UNICAST_FLOOD, + ANA_ANEVENTS_DEST_KNOWN, + ANA_ANEVENTS_BUCKET3_MATCH, + ANA_ANEVENTS_BUCKET2_MATCH, + ANA_ANEVENTS_BUCKET1_MATCH, + ANA_ANEVENTS_BUCKET0_MATCH, + ANA_ANEVENTS_CPU_OPERATION, + ANA_ANEVENTS_DMAC_LOOKUP, + ANA_ANEVENTS_SMAC_LOOKUP, + ANA_ANEVENTS_SEQ_GEN_ERR_0, + ANA_ANEVENTS_SEQ_GEN_ERR_1, + ANA_TABLES_MACACCESS_B_DOM, + ANA_TABLES_MACTINDX_BUCKET, + ANA_TABLES_MACTINDX_M_INDEX, + QSYS_TIMED_FRAME_ENTRY_TFRM_VLD, + QSYS_TIMED_FRAME_ENTRY_TFRM_FP, + QSYS_TIMED_FRAME_ENTRY_TFRM_PORTNO, + QSYS_TIMED_FRAME_ENTRY_TFRM_TM_SEL, + QSYS_TIMED_FRAME_ENTRY_TFRM_TM_T, + SYS_RESET_CFG_CORE_ENA, + SYS_RESET_CFG_MEM_ENA, + SYS_RESET_CFG_MEM_INIT, + GCB_SOFT_RST_SWC_RST, + REGFIELD_MAX +}; + +enum ocelot_clk_pins { + ALT_PPS_PIN = 1, + EXT_CLK_PIN, + ALT_LDST_PIN, + TOD_ACC_PIN +}; + +struct ocelot_stat_layout { + u32 offset; + char name[ETH_GSTRING_LEN]; +}; + +enum ocelot_tag_prefix { + OCELOT_TAG_PREFIX_DISABLED = 0, + OCELOT_TAG_PREFIX_NONE, + OCELOT_TAG_PREFIX_SHORT, + OCELOT_TAG_PREFIX_LONG, +}; + +struct ocelot; + +struct ocelot_ops { + void (*pcs_init)(struct ocelot *ocelot, int port); + int (*reset)(struct ocelot *ocelot); +}; + +struct ocelot_port { + struct ocelot *ocelot; + + void __iomem *regs; + + /* Ingress default VLAN (pvid) */ + u16 pvid; + + /* Egress default VLAN (vid) */ + u16 vid; + + u8 ptp_cmd; + struct list_head skbs; + u8 ts_id; +}; + +struct ocelot { + struct device *dev; + + const struct ocelot_ops *ops; + struct regmap *targets[TARGET_MAX]; + struct regmap_field *regfields[REGFIELD_MAX]; + const u32 *const *map; + const struct ocelot_stat_layout *stats_layout; + unsigned int num_stats; + + int shared_queue_sz; + + struct net_device *hw_bridge_dev; + u16 bridge_mask; + u16 bridge_fwd_mask; + + struct ocelot_port **ports; + + u8 base_mac[ETH_ALEN]; + + /* Keep track of the vlan port masks */ + u32 vlan_mask[VLAN_N_VID]; + + u8 
num_phys_ports; + u8 num_cpu_ports; + u8 cpu; + + u32 *lags; + + struct list_head multicast; + + /* Workqueue to check statistics for overflow with its lock */ + struct mutex stats_lock; + u64 *stats; + struct delayed_work stats_work; + struct workqueue_struct *stats_queue; + + u8 ptp:1; + struct ptp_clock *ptp_clock; + struct ptp_clock_info ptp_info; + struct hwtstamp_config hwtstamp_config; + /* Protects the PTP interface state */ + struct mutex ptp_lock; + /* Protects the PTP clock */ + spinlock_t ptp_clock_lock; + + void (*port_pcs_init)(struct ocelot_port *port); +}; + +#define ocelot_read_ix(ocelot, reg, gi, ri) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) +#define ocelot_read_gix(ocelot, reg, gi) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi)) +#define ocelot_read_rix(ocelot, reg, ri) __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri)) +#define ocelot_read(ocelot, reg) __ocelot_read_ix(ocelot, reg, 0) + +#define ocelot_write_ix(ocelot, val, reg, gi, ri) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) +#define ocelot_write_gix(ocelot, val, reg, gi) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi)) +#define ocelot_write_rix(ocelot, val, reg, ri) __ocelot_write_ix(ocelot, val, reg, reg##_RSZ * (ri)) +#define ocelot_write(ocelot, val, reg) __ocelot_write_ix(ocelot, val, reg, 0) + +#define ocelot_rmw_ix(ocelot, val, m, reg, gi, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) +#define ocelot_rmw_gix(ocelot, val, m, reg, gi) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi)) +#define ocelot_rmw_rix(ocelot, val, m, reg, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_RSZ * (ri)) +#define ocelot_rmw(ocelot, val, m, reg) __ocelot_rmw_ix(ocelot, val, m, reg, 0) + +/* I/O */ +u32 ocelot_port_readl(struct ocelot_port *port, u32 reg); +void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg); +u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset); +void 
__ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset); +void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg, + u32 offset); + +/* Hardware initialization */ +int ocelot_regfields_init(struct ocelot *ocelot, + const struct reg_field *const regfields); +struct regmap *ocelot_regmap_init(struct ocelot *ocelot, struct resource *res); +void ocelot_set_cpu_port(struct ocelot *ocelot, int cpu, + enum ocelot_tag_prefix injection, + enum ocelot_tag_prefix extraction); +int ocelot_init(struct ocelot *ocelot); +void ocelot_deinit(struct ocelot *ocelot); +void ocelot_init_port(struct ocelot *ocelot, int port); + +/* DSA callbacks */ +void ocelot_port_enable(struct ocelot *ocelot, int port, + struct phy_device *phy); +void ocelot_port_disable(struct ocelot *ocelot, int port); +void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset, u8 *data); +void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data); +int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset); +int ocelot_get_ts_info(struct ocelot *ocelot, int port, + struct ethtool_ts_info *info); +void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs); +void ocelot_adjust_link(struct ocelot *ocelot, int port, + struct phy_device *phydev); +void ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, + bool vlan_aware); +void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state); +int ocelot_port_bridge_join(struct ocelot *ocelot, int port, + struct net_device *bridge); +int ocelot_port_bridge_leave(struct ocelot *ocelot, int port, + struct net_device *bridge); +int ocelot_fdb_dump(struct ocelot *ocelot, int port, + dsa_fdb_dump_cb_t *cb, void *data); +int ocelot_fdb_add(struct ocelot *ocelot, int port, + const unsigned char *addr, u16 vid, bool vlan_aware); +int ocelot_fdb_del(struct ocelot *ocelot, int port, + const unsigned char *addr, u16 vid); +int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool 
pvid, + bool untagged); +int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); +int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts); +void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts); + +#endif -- cgit v1.2.3 From a030dfe1947310a2140b9e371dc9ebfab72c914f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 14 Nov 2019 17:03:28 +0200 Subject: net: mscc: ocelot: publish ocelot_sys.h to include/soc/mscc The Felix DSA driver needs to write to SYS_RAM_INIT_RAM_INIT for its own chip initialization process. Also update the MAINTAINERS file such that the headers exported by the ocelot driver are under the same maintainers' umbrella as the driver itself. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/soc/mscc/ocelot_sys.h | 144 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 include/soc/mscc/ocelot_sys.h (limited to 'include') diff --git a/include/soc/mscc/ocelot_sys.h b/include/soc/mscc/ocelot_sys.h new file mode 100644 index 000000000000..16f91e172bcb --- /dev/null +++ b/include/soc/mscc/ocelot_sys.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* + * Microsemi Ocelot Switch driver + * + * Copyright (c) 2017 Microsemi Corporation + */ + +#ifndef _MSCC_OCELOT_SYS_H_ +#define _MSCC_OCELOT_SYS_H_ + +#define SYS_COUNT_RX_OCTETS_RSZ 0x4 + +#define SYS_COUNT_TX_OCTETS_RSZ 0x4 + +#define SYS_PORT_MODE_RSZ 0x4 + +#define SYS_PORT_MODE_DATA_WO_TS(x) (((x) << 5) & GENMASK(6, 5)) +#define SYS_PORT_MODE_DATA_WO_TS_M GENMASK(6, 5) +#define SYS_PORT_MODE_DATA_WO_TS_X(x) (((x) & GENMASK(6, 5)) >> 5) +#define SYS_PORT_MODE_INCL_INJ_HDR(x) (((x) << 3) & GENMASK(4, 3)) +#define SYS_PORT_MODE_INCL_INJ_HDR_M GENMASK(4, 3) +#define SYS_PORT_MODE_INCL_INJ_HDR_X(x) (((x) & GENMASK(4, 3)) >> 3) +#define SYS_PORT_MODE_INCL_XTR_HDR(x) (((x) << 1) & GENMASK(2, 1)) +#define SYS_PORT_MODE_INCL_XTR_HDR_M 
GENMASK(2, 1) +#define SYS_PORT_MODE_INCL_XTR_HDR_X(x) (((x) & GENMASK(2, 1)) >> 1) +#define SYS_PORT_MODE_INJ_HDR_ERR BIT(0) + +#define SYS_FRONT_PORT_MODE_RSZ 0x4 + +#define SYS_FRONT_PORT_MODE_HDX_MODE BIT(0) + +#define SYS_FRM_AGING_AGE_TX_ENA BIT(20) +#define SYS_FRM_AGING_MAX_AGE(x) ((x) & GENMASK(19, 0)) +#define SYS_FRM_AGING_MAX_AGE_M GENMASK(19, 0) + +#define SYS_STAT_CFG_STAT_CLEAR_SHOT(x) (((x) << 10) & GENMASK(16, 10)) +#define SYS_STAT_CFG_STAT_CLEAR_SHOT_M GENMASK(16, 10) +#define SYS_STAT_CFG_STAT_CLEAR_SHOT_X(x) (((x) & GENMASK(16, 10)) >> 10) +#define SYS_STAT_CFG_STAT_VIEW(x) ((x) & GENMASK(9, 0)) +#define SYS_STAT_CFG_STAT_VIEW_M GENMASK(9, 0) + +#define SYS_SW_STATUS_RSZ 0x4 + +#define SYS_SW_STATUS_PORT_RX_PAUSED BIT(0) + +#define SYS_MISC_CFG_PTP_RSRV_CLR BIT(1) +#define SYS_MISC_CFG_PTP_DIS_NEG_RO BIT(0) + +#define SYS_REW_MAC_HIGH_CFG_RSZ 0x4 + +#define SYS_REW_MAC_LOW_CFG_RSZ 0x4 + +#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG(x) (((x) << 6) & GENMASK(21, 6)) +#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG_M GENMASK(21, 6) +#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG_X(x) (((x) & GENMASK(21, 6)) >> 6) +#define SYS_TIMESTAMP_OFFSET_TIMESTAMP_OFFSET(x) ((x) & GENMASK(5, 0)) +#define SYS_TIMESTAMP_OFFSET_TIMESTAMP_OFFSET_M GENMASK(5, 0) + +#define SYS_PAUSE_CFG_RSZ 0x4 + +#define SYS_PAUSE_CFG_PAUSE_START(x) (((x) << 10) & GENMASK(18, 10)) +#define SYS_PAUSE_CFG_PAUSE_START_M GENMASK(18, 10) +#define SYS_PAUSE_CFG_PAUSE_START_X(x) (((x) & GENMASK(18, 10)) >> 10) +#define SYS_PAUSE_CFG_PAUSE_STOP(x) (((x) << 1) & GENMASK(9, 1)) +#define SYS_PAUSE_CFG_PAUSE_STOP_M GENMASK(9, 1) +#define SYS_PAUSE_CFG_PAUSE_STOP_X(x) (((x) & GENMASK(9, 1)) >> 1) +#define SYS_PAUSE_CFG_PAUSE_ENA BIT(0) + +#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START(x) (((x) << 9) & GENMASK(17, 9)) +#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START_M GENMASK(17, 9) +#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START_X(x) (((x) & GENMASK(17, 9)) >> 9) +#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_STOP(x) ((x) & 
GENMASK(8, 0)) +#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_STOP_M GENMASK(8, 0) + +#define SYS_ATOP_RSZ 0x4 + +#define SYS_MAC_FC_CFG_RSZ 0x4 + +#define SYS_MAC_FC_CFG_FC_LINK_SPEED(x) (((x) << 26) & GENMASK(27, 26)) +#define SYS_MAC_FC_CFG_FC_LINK_SPEED_M GENMASK(27, 26) +#define SYS_MAC_FC_CFG_FC_LINK_SPEED_X(x) (((x) & GENMASK(27, 26)) >> 26) +#define SYS_MAC_FC_CFG_FC_LATENCY_CFG(x) (((x) << 20) & GENMASK(25, 20)) +#define SYS_MAC_FC_CFG_FC_LATENCY_CFG_M GENMASK(25, 20) +#define SYS_MAC_FC_CFG_FC_LATENCY_CFG_X(x) (((x) & GENMASK(25, 20)) >> 20) +#define SYS_MAC_FC_CFG_ZERO_PAUSE_ENA BIT(18) +#define SYS_MAC_FC_CFG_TX_FC_ENA BIT(17) +#define SYS_MAC_FC_CFG_RX_FC_ENA BIT(16) +#define SYS_MAC_FC_CFG_PAUSE_VAL_CFG(x) ((x) & GENMASK(15, 0)) +#define SYS_MAC_FC_CFG_PAUSE_VAL_CFG_M GENMASK(15, 0) + +#define SYS_MMGT_RELCNT(x) (((x) << 16) & GENMASK(31, 16)) +#define SYS_MMGT_RELCNT_M GENMASK(31, 16) +#define SYS_MMGT_RELCNT_X(x) (((x) & GENMASK(31, 16)) >> 16) +#define SYS_MMGT_FREECNT(x) ((x) & GENMASK(15, 0)) +#define SYS_MMGT_FREECNT_M GENMASK(15, 0) + +#define SYS_MMGT_FAST_FREEVLD(x) (((x) << 4) & GENMASK(7, 4)) +#define SYS_MMGT_FAST_FREEVLD_M GENMASK(7, 4) +#define SYS_MMGT_FAST_FREEVLD_X(x) (((x) & GENMASK(7, 4)) >> 4) +#define SYS_MMGT_FAST_RELVLD(x) ((x) & GENMASK(3, 0)) +#define SYS_MMGT_FAST_RELVLD_M GENMASK(3, 0) + +#define SYS_EVENTS_DIF_RSZ 0x4 + +#define SYS_EVENTS_DIF_EV_DRX(x) (((x) << 6) & GENMASK(8, 6)) +#define SYS_EVENTS_DIF_EV_DRX_M GENMASK(8, 6) +#define SYS_EVENTS_DIF_EV_DRX_X(x) (((x) & GENMASK(8, 6)) >> 6) +#define SYS_EVENTS_DIF_EV_DTX(x) ((x) & GENMASK(5, 0)) +#define SYS_EVENTS_DIF_EV_DTX_M GENMASK(5, 0) + +#define SYS_EVENTS_CORE_EV_FWR BIT(2) +#define SYS_EVENTS_CORE_EV_ANA(x) ((x) & GENMASK(1, 0)) +#define SYS_EVENTS_CORE_EV_ANA_M GENMASK(1, 0) + +#define SYS_CNT_GSZ 0x4 + +#define SYS_PTP_STATUS_PTP_TXSTAMP_OAM BIT(29) +#define SYS_PTP_STATUS_PTP_OVFL BIT(28) +#define SYS_PTP_STATUS_PTP_MESS_VLD BIT(27) +#define SYS_PTP_STATUS_PTP_MESS_ID(x) 
(((x) << 21) & GENMASK(26, 21)) +#define SYS_PTP_STATUS_PTP_MESS_ID_M GENMASK(26, 21) +#define SYS_PTP_STATUS_PTP_MESS_ID_X(x) (((x) & GENMASK(26, 21)) >> 21) +#define SYS_PTP_STATUS_PTP_MESS_TXPORT(x) (((x) << 16) & GENMASK(20, 16)) +#define SYS_PTP_STATUS_PTP_MESS_TXPORT_M GENMASK(20, 16) +#define SYS_PTP_STATUS_PTP_MESS_TXPORT_X(x) (((x) & GENMASK(20, 16)) >> 16) +#define SYS_PTP_STATUS_PTP_MESS_SEQ_ID(x) ((x) & GENMASK(15, 0)) +#define SYS_PTP_STATUS_PTP_MESS_SEQ_ID_M GENMASK(15, 0) + +#define SYS_PTP_TXSTAMP_PTP_TXSTAMP(x) ((x) & GENMASK(29, 0)) +#define SYS_PTP_TXSTAMP_PTP_TXSTAMP_M GENMASK(29, 0) +#define SYS_PTP_TXSTAMP_PTP_TXSTAMP_SEC BIT(31) + +#define SYS_PTP_NXT_PTP_NXT BIT(0) + +#define SYS_PTP_CFG_PTP_STAMP_WID(x) (((x) << 2) & GENMASK(7, 2)) +#define SYS_PTP_CFG_PTP_STAMP_WID_M GENMASK(7, 2) +#define SYS_PTP_CFG_PTP_STAMP_WID_X(x) (((x) & GENMASK(7, 2)) >> 2) +#define SYS_PTP_CFG_PTP_CF_ROLL_MODE(x) ((x) & GENMASK(1, 0)) +#define SYS_PTP_CFG_PTP_CF_ROLL_MODE_M GENMASK(1, 0) + +#define SYS_RAM_INIT_RAM_INIT BIT(1) +#define SYS_RAM_INIT_RAM_CFG_HOOK BIT(0) + +#endif -- cgit v1.2.3 From 8dce89aa5f3274e7c26132433840f63d129406bb Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 14 Nov 2019 17:03:29 +0200 Subject: net: dsa: ocelot: add tagger for Ocelot/Felix switches While it is entirely possible that this tagger format is in fact more generic than just these 2 switch families, I don't have that knowledge. The Seville switch in NXP T1040 has a similar frame format, but there are enough differences (e.g. DEST field starts at bit 57 instead of 56) that calling this file tag_vitesse.c is a bit of a stretch at the moment. The frame format has been listed in a comment so that people who add support for further Vitesse switches can rework this tagger while keeping compatibility with Felix. 
The "ocelot" name was chosen instead of "felix" because even the Ocelot switch can act as a DSA device when it is used in NPI mode, and the Felix tagger format is almost identical. Currently it is only used for the Felix switch embedded in the NXP LS1028A chip. The ABI for this tagger should be considered "not stable" at the moment. The DSA tag is always placed before the Ethernet header and therefore, we are using the long prefix for RX tags to avoid putting the DSA master port in promiscuous mode. Once there will be an API in DSA for drivers to request DSA masters to be in promiscuous mode unconditionally, we will switch to the "no prefix" extraction frame header, which will save 16 padding bytes for each RX frame. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9507611a41f0..6767dc3f66c0 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -42,6 +42,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_8021Q_VALUE 12 #define DSA_TAG_PROTO_SJA1105_VALUE 13 #define DSA_TAG_PROTO_KSZ8795_VALUE 14 +#define DSA_TAG_PROTO_OCELOT_VALUE 15 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -59,6 +60,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, + DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, }; struct packet_type; -- cgit v1.2.3 From b7b3fc8dd95bc02bd30680da258e09dda55270db Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 15 Nov 2019 13:37:22 +0100 Subject: bpf: Support doubleword alignment in bpf_jit_binary_alloc Currently passing alignment greater than 4 to bpf_jit_binary_alloc does not work: in such cases it silently aligns only to 4 bytes. 
On s390, in order to load a constant from memory in a large (>512k) BPF program, one must use lgrl instruction, whose memory operand must be aligned on an 8-byte boundary. This patch makes it possible to request 8-byte alignment from bpf_jit_binary_alloc, and also makes it issue a warning when an unsupported alignment is requested. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191115123722.58462-1-iii@linux.ibm.com --- include/linux/filter.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7a6f8f6f1da4..ad80e9c6111c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -515,10 +515,12 @@ struct sock_fprog_kern { struct sock_filter *filter; }; +/* Some arches need doubleword alignment for their instructions and/or data */ +#define BPF_IMAGE_ALIGNMENT 8 + struct bpf_binary_header { u32 pages; - /* Some arches need word alignment for their instructions */ - u8 image[] __aligned(4); + u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; struct bpf_prog { -- cgit v1.2.3 From 5964b2000f283ff5df366f718e0f083ebbaae977 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:03 -0800 Subject: bpf: Add bpf_arch_text_poke() helper Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch nops/calls in kernel text into calls into BPF trampoline and to patch calls/nops inside BPF programs too. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-4-ast@kernel.org --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7c7f518811a6..8b90db25348a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1157,4 +1157,12 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, } #endif /* CONFIG_INET */ +enum bpf_text_poke_type { + BPF_MOD_NOP_TO_CALL, + BPF_MOD_CALL_TO_CALL, + BPF_MOD_CALL_TO_NOP, +}; +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *addr1, void *addr2); + #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From fec56f5890d93fc2ed74166c397dc186b1c25951 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:04 -0800 Subject: bpf: Introduce BPF trampoline Introduce BPF trampoline concept to allow kernel code to call into BPF programs with practically zero overhead. The trampoline generation logic is architecture dependent. It's converting native calling convention into BPF calling convention. BPF ISA is 64-bit (even on 32-bit architectures). The registers R1 to R5 are used to pass arguments into BPF functions. The main BPF program accepts only a single argument "ctx" in R1. Whereas CPU native calling convention is different. x86-64 is passing first 6 arguments in registers and the rest on the stack. x86-32 is passing first 3 arguments in registers. sparc64 is passing first 6 in registers. And so on. The trampolines between BPF and kernel already exist. BPF_CALL_x macros in include/linux/filter.h statically compile trampolines from BPF into kernel helpers. They convert up to five u64 arguments into kernel C pointers and integers. On 64-bit architectures these BPF_to_kernel trampolines are nops. On 32-bit architectures they're meaningful. 
The opposite job, kernel_to_BPF trampolines, is done by CAST_TO_U64 macros and __bpf_trace_##call() shim functions in include/trace/bpf_probe.h. They convert kernel function arguments into array of u64s that BPF program consumes via R1=ctx pointer. This patch set is doing the same job as __bpf_trace_##call() static trampolines, but dynamically for any kernel function. There are ~22k global kernel functions that are attachable via nop at function entry. The function arguments and types are described in BTF. The job of btf_distill_func_proto() function is to extract useful information from BTF into "function model" that architecture dependent trampoline generators will use to generate assembly code to cast kernel function arguments into array of u64s. For example the kernel function eth_type_trans has two pointers. They will be cast to u64 and stored into stack of generated trampoline. The pointer to that stack space will be passed into BPF program in R1. On x86-64 such generated trampoline will consume 16 bytes of stack and two stores of %rdi and %rsi into stack. The verifier will make sure that only two u64 are accessed read-only by BPF program. The verifier will also recognize the precise type of the pointers being accessed and will not allow typecasting of the pointer to a different type within BPF program. The tracing use case in the datacenter demonstrated that certain key kernel functions (like tcp_retransmit_skb) have 2 or more kprobes that are always active. Other functions have both kprobe and kretprobe. So it is essential to keep both kernel code and BPF programs executing at maximum speed. Hence generated BPF trampoline is re-generated every time new program is attached or detached to maintain maximum performance. To avoid the high cost of retpoline the attached BPF programs are called directly. __bpf_prog_enter/exit() are used to support per-program execution stats. 
In the future this logic will be optimized further by adding support for bpf_stats_enabled_key inside generated assembly code. Introduction of preemptible and sleepable BPF programs will completely remove the need to call to __bpf_prog_enter/exit(). Detach of a BPF program from the trampoline should not fail. To avoid memory allocation in detach path the half of the page is used as a reserve and flipped after each attach/detach. 2k bytes is enough to call 40+ BPF programs directly which is enough for BPF tracing use cases. This limit can be increased in the future. BPF_TRACE_FENTRY programs have access to raw kernel function arguments while BPF_TRACE_FEXIT programs have access to kernel return value as well. Often kprobe BPF program remembers function arguments in a map while kretprobe fetches arguments from a map and analyzes them together with return value. BPF_TRACE_FEXIT accelerates this typical use case. Recursion prevention for kprobe BPF programs is done via per-cpu bpf_prog_active counter. In practice that turned out to be a mistake. It caused programs to randomly skip execution. The tracing tools missed results they were looking for. Hence BPF trampoline doesn't provide builtin recursion prevention. It's a job of BPF program itself and will be addressed in the follow up patches. BPF trampoline is intended to be used beyond tracing and fentry/fexit use cases in the future. For example to remove retpoline cost from XDP programs. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-5-ast@kernel.org --- include/linux/bpf.h | 105 +++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 2 + 2 files changed, 107 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8b90db25348a..0d4c5c224d79 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -14,6 +14,8 @@ #include #include #include +#include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -384,6 +386,100 @@ struct bpf_prog_stats { struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); +struct btf_func_model { + u8 ret_size; + u8 nr_args; + u8 arg_size[MAX_BPF_FUNC_ARGS]; +}; + +/* Restore arguments before returning from trampoline to let original function + * continue executing. This flag is used for fentry progs when there are no + * fexit progs. + */ +#define BPF_TRAMP_F_RESTORE_REGS BIT(0) +/* Call original function after fentry progs, but before fexit progs. + * Makes sense for fentry/fexit, normal calls and indirect calls. + */ +#define BPF_TRAMP_F_CALL_ORIG BIT(1) +/* Skip current frame and return to parent. Makes sense for fentry/fexit + * programs only. Should not be used with normal calls and indirect calls. + */ +#define BPF_TRAMP_F_SKIP_FRAME BIT(2) + +/* Different use cases for BPF trampoline: + * 1. replace nop at the function entry (kprobe equivalent) + * flags = BPF_TRAMP_F_RESTORE_REGS + * fentry = a set of programs to run before returning from trampoline + * + * 2. replace nop at the function entry (kprobe + kretprobe equivalent) + * flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME + * orig_call = fentry_ip + MCOUNT_INSN_SIZE + * fentry = a set of program to run before calling original function + * fexit = a set of program to run after original function + * + * 3. 
replace direct call instruction anywhere in the function body + * or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid) + * With flags = 0 + * fentry = a set of programs to run before returning from trampoline + * With flags = BPF_TRAMP_F_CALL_ORIG + * orig_call = original callback addr or direct function addr + * fentry = a set of program to run before calling original function + * fexit = a set of program to run after original function + */ +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, + struct bpf_prog **fentry_progs, int fentry_cnt, + struct bpf_prog **fexit_progs, int fexit_cnt, + void *orig_call); +/* these two functions are called from generated trampoline */ +u64 notrace __bpf_prog_enter(void); +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); + +enum bpf_tramp_prog_type { + BPF_TRAMP_FENTRY, + BPF_TRAMP_FEXIT, + BPF_TRAMP_MAX +}; + +struct bpf_trampoline { + /* hlist for trampoline_table */ + struct hlist_node hlist; + /* serializes access to fields of this trampoline */ + struct mutex mutex; + refcount_t refcnt; + u64 key; + struct { + struct btf_func_model model; + void *addr; + } func; + /* list of BPF programs using this trampoline */ + struct hlist_head progs_hlist[BPF_TRAMP_MAX]; + /* Number of attached programs. A counter per kind. 
*/ + int progs_cnt[BPF_TRAMP_MAX]; + /* Executable image of trampoline */ + void *image; + u64 selector; +}; +#ifdef CONFIG_BPF_JIT +struct bpf_trampoline *bpf_trampoline_lookup(u64 key); +int bpf_trampoline_link_prog(struct bpf_prog *prog); +int bpf_trampoline_unlink_prog(struct bpf_prog *prog); +void bpf_trampoline_put(struct bpf_trampoline *tr); +#else +static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +{ + return NULL; +} +static inline int bpf_trampoline_link_prog(struct bpf_prog *prog) +{ + return -ENOTSUPP; +} +static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) +{ + return -ENOTSUPP; +} +static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} +#endif + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -398,6 +494,9 @@ struct bpf_prog_aux { bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + enum bpf_tramp_prog_type trampoline_prog_type; + struct bpf_trampoline *trampoline; + struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ @@ -784,6 +883,12 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 *next_btf_id); u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int); +int btf_distill_func_proto(struct bpf_verifier_log *log, + struct btf *btf, + const struct btf_type *func_proto, + const char *func_name, + struct btf_func_model *m); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index df6809a76404..69c200e6e696 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -201,6 +201,8 @@ enum bpf_attach_type { BPF_CGROUP_GETSOCKOPT, BPF_CGROUP_SETSOCKOPT, BPF_TRACE_RAW_TP, + BPF_TRACE_FENTRY, + BPF_TRACE_FEXIT, __MAX_BPF_ATTACH_TYPE }; -- cgit 
v1.2.3 From 9cc31b3a092d9bf2a18f09ad77e727ddb42a5b1e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:14 -0800 Subject: bpf: Fix race in btf_resolve_helper_id() btf_resolve_helper_id() caching logic is a bit racy, since under root the verifier can verify several programs in parallel. Fix it with READ/WRITE_ONCE. Fix the type as well, since error is also recorded. Fixes: a7658e1a4164 ("bpf: Check types of arguments passed into helpers") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-15-ast@kernel.org --- include/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0d4c5c224d79..cb5a356381f5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -248,7 +248,7 @@ struct bpf_func_proto { }; enum bpf_arg_type arg_type[5]; }; - u32 *btf_id; /* BTF ids of arguments */ + int *btf_id; /* BTF ids of arguments */ }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -881,7 +881,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); -u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int); +int btf_resolve_helper_id(struct bpf_verifier_log *log, + const struct bpf_func_proto *fn, int); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, -- cgit v1.2.3 From 5c27d8d76ce810c6254cf5917a6019d824f34bd2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 13 Nov 2019 14:08:01 +0100 Subject: netfilter: nf_flow_table_offload: add IPv6 support Add nf_flow_rule_route_ipv6() and use it from the IPv6 and the inet flowtable type definitions. Rename the nf_flow_rule_route() function to nf_flow_rule_route_ipv4(). 
Adjust maximum number of actions, which now becomes 16 to leave sufficient room for the IPv6 address mangling for NAT. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index eea66de328d3..f0897b3c97fb 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -163,9 +163,12 @@ void nf_flow_table_offload_flush(struct nf_flowtable *flowtable); int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, struct net_device *dev, enum flow_block_command cmd); -int nf_flow_rule_route(struct net *net, const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule); +int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); +int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); -- cgit v1.2.3 From 91cc1a99740e2ed1d903b5906afb470cc5a07379 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:15 -0800 Subject: bpf: Annotate context types Annotate BPF program context types with program-side type and kernel-side type. This type information is used by the verifier. btf_get_prog_ctx_type() is used in the later patches to verify that BTF type of ctx in BPF program matches to kernel expected ctx type. For example, the XDP program type is: BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp, struct xdp_md, struct xdp_buff) That means that XDP program should be written as: int xdp_prog(struct xdp_md *ctx) { ... 
} Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-16-ast@kernel.org --- include/linux/bpf.h | 11 ++++++- include/linux/bpf_types.h | 78 +++++++++++++++++++++++++++++++---------------- 2 files changed, 62 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cb5a356381f5..9c48f11fe56e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -747,7 +747,7 @@ DECLARE_PER_CPU(int, bpf_prog_active); extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; -#define BPF_PROG_TYPE(_id, _name) \ +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ @@ -1213,6 +1213,15 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, #endif #ifdef CONFIG_INET +struct sk_reuseport_kern { + struct sk_buff *skb; + struct sock *sk; + struct sock *selected_sk; + void *data_end; + u32 hash; + u32 reuseport_id; + bool bind_inany; +}; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index de14872b01ba..93740b3614d7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -2,42 +2,68 @@ /* internal file - do not include directly */ #ifdef CONFIG_NET -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) -BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, 
tc_cls_act, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp, + struct xdp_md, struct xdp_buff) #ifdef CONFIG_CGROUP_BPF -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock, + struct bpf_sock, struct sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr, + struct bpf_sock_addr, struct bpf_sock_addr_kern) #endif -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) -BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops, + struct bpf_sock_ops, struct bpf_sock_ops_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg, + struct sk_msg_md, struct sk_msg) +BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector, + struct __sk_buff, struct bpf_flow_dissector) #endif #ifdef CONFIG_BPF_EVENTS -BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) -BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) -BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) -BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) 
-BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) -BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing) +BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe, + bpf_user_pt_regs_t, struct pt_regs) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint, + __u64, u64) +BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event, + struct bpf_perf_event_data, struct bpf_perf_event_data_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint, + struct bpf_raw_tracepoint_args, u64) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable, + struct bpf_raw_tracepoint_args, u64) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing, + void *, void *) #endif #ifdef CONFIG_CGROUP_BPF -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev, + struct bpf_cgroup_dev_ctx, struct bpf_cgroup_dev_ctx) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl, + struct bpf_sysctl, struct bpf_sysctl_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt, + struct bpf_sockopt, struct bpf_sockopt_kern) #endif #ifdef CONFIG_BPF_LIRC_MODE2 -BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) +BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, + __u32, u32) #endif #ifdef CONFIG_INET -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport, + struct sk_reuseport_md, struct sk_reuseport_kern) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) -- cgit v1.2.3 From 8c1b6e69dcc1e11bd24111e3734dd740aaf3fda1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:16 -0800 Subject: bpf: Compare BTF types of functions arguments with actual types Make the verifier check that BTF types of function arguments match actual types passed into top-level BPF program and into BPF-to-BPF calls. 
If types match such BPF programs and sub-programs will have full support of BPF trampoline. If types mismatch the trampoline has to be conservative. It has to save/restore five program arguments and assume 64-bit scalars. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-17-ast@kernel.org --- include/linux/bpf.h | 8 ++++++++ include/linux/bpf_verifier.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9c48f11fe56e..c70bf04726b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -480,6 +480,10 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #endif +struct bpf_func_info_aux { + bool unreliable; +}; + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -494,6 +498,7 @@ struct bpf_prog_aux { bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + bool func_proto_unreliable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; @@ -518,6 +523,7 @@ struct bpf_prog_aux { struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; + struct bpf_func_info_aux *func_info_aux; /* bpf_line_info loaded from userspace. linfo->insn_off * has the xlated insn offset. * Both the main and sub prog share the same linfo. 
@@ -890,6 +896,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); +int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6e7284ea1468..cdd08bf0ec06 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -343,6 +343,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_info { + /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ -- cgit v1.2.3 From 5b92a28aae4dd0f88778d540ecfdcdaec5a41723 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:17 -0800 Subject: bpf: Support attaching tracing BPF program to other BPF programs Allow FENTRY/FEXIT BPF programs to attach to other BPF programs of any type including their subprograms. This feature allows snooping on input and output packets in XDP, TC programs including their return values. In order to do that the verifier needs to track types not only of vmlinux, but types of other BPF programs as well. The verifier also needs to translate uapi/linux/bpf.h types used by networking programs into kernel internal BTF types used by FENTRY/FEXIT BPF programs. In some cases LLVM optimizations can remove arguments from BPF subprograms without adjusting BTF info that LLVM backend knows. When BTF info disagrees with actual types that the verifier sees, the BPF trampoline has to fall back to being conservative and treat all arguments as u64. 
The FENTRY/FEXIT program can still attach to such subprograms, but it won't be able to recognize pointer types like 'struct sk_buff *' and it won't be able to pass them to bpf_skb_output() for dumping packets to user space. The FENTRY/FEXIT program would need to use bpf_probe_read_kernel() instead. The BPF_PROG_LOAD command is extended with attach_prog_fd field. When it's set to zero the attach_btf_id is one of the vmlinux BTF type ids. When attach_prog_fd points to a previously loaded BPF program the attach_btf_id is the BTF type id of its main function or one of its subprograms. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-18-ast@kernel.org --- include/linux/bpf.h | 1 + include/linux/btf.h | 1 + include/uapi/linux/bpf.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c70bf04726b4..5b81cde47314 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -495,6 +495,7 @@ struct bpf_prog_aux { u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + struct bpf_prog *linked_prog; bool verifier_zext; /* Zero extensions has been inserted by verifier. 
*/ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 9dee00859c5f..79d4abc2556a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -88,6 +88,7 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t) const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); +struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69c200e6e696..4842a134b202 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -425,6 +425,7 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + __u32 attach_prog_fd; /* 0 to attach to vmlinux */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From c3f812cea0d7006469d1cf33a4a9f0a12bb4b3a3 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Thu, 14 Nov 2019 14:13:00 -0800 Subject: page_pool: do not release pool until inflight == 0. The page pool keeps track of the number of pages in flight, and it isn't safe to remove the pool until all pages are returned. Disallow removing the pool until all pages are back, so the pool is always available for page producers. Make the page pool responsible for its own delayed destruction instead of relying on XDP, so the page pool can be used without the xdp memory model. When all pages are returned, free the pool and notify xdp if the pool is registered with the xdp memory system. Have the callback perform a table walk since some drivers (cpsw) may share the pool among multiple xdp_rxq_info. 
Note that the increment of pages_state_release_cnt may result in inflight == 0, resulting in the pool being released. Fixes: d956a048cd3f ("xdp: force mem allocator removal and periodic warning") Signed-off-by: Jonathan Lemon Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 52 +++++++++++++--------------------------------- include/net/xdp_priv.h | 4 ---- include/trace/events/xdp.h | 19 ++++------------- 3 files changed, 18 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 2cbcdbdec254..1121faa99c12 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -70,7 +70,12 @@ struct page_pool_params { struct page_pool { struct page_pool_params p; - u32 pages_state_hold_cnt; + struct delayed_work release_dw; + void (*disconnect)(void *); + unsigned long defer_start; + unsigned long defer_warn; + + u32 pages_state_hold_cnt; /* * Data structure for allocation side @@ -129,25 +134,19 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) struct page_pool *page_pool_create(const struct page_pool_params *params); -void __page_pool_free(struct page_pool *pool); -static inline void page_pool_free(struct page_pool *pool) -{ - /* When page_pool isn't compiled-in, net/core/xdp.c doesn't - * allow registering MEM_TYPE_PAGE_POOL, but shield linker. 
- */ #ifdef CONFIG_PAGE_POOL - __page_pool_free(pool); -#endif -} - -/* Drivers use this instead of page_pool_free */ +void page_pool_destroy(struct page_pool *pool); +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); +#else static inline void page_pool_destroy(struct page_pool *pool) { - if (!pool) - return; +} - page_pool_free(pool); +static inline void page_pool_use_xdp_mem(struct page_pool *pool, + void (*disconnect)(void *)) +{ } +#endif /* Never call this directly, use helpers below */ void __page_pool_put_page(struct page_pool *pool, @@ -170,24 +169,6 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, __page_pool_put_page(pool, page, true); } -/* API user MUST have disconnected alloc-side (not allowed to call - * page_pool_alloc_pages()) before calling this. The free-side can - * still run concurrently, to handle in-flight packet-pages. - * - * A request to shutdown can fail (with false) if there are still - * in-flight packet-pages. - */ -bool __page_pool_request_shutdown(struct page_pool *pool); -static inline bool page_pool_request_shutdown(struct page_pool *pool) -{ - bool safe_to_remove = false; - -#ifdef CONFIG_PAGE_POOL - safe_to_remove = __page_pool_request_shutdown(pool); -#endif - return safe_to_remove; -} - /* Disconnects a page (from a page_pool). 
API users can have a need * to disconnect a page (from a page_pool), to allow it to be used as * a regular page (that will eventually be returned to the normal @@ -216,11 +197,6 @@ static inline bool is_page_pool_compiled_in(void) #endif } -static inline void page_pool_get(struct page_pool *pool) -{ - refcount_inc(&pool->user_cnt); -} - static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h index 6a8cba6ea79a..a9d5b7603b89 100644 --- a/include/net/xdp_priv.h +++ b/include/net/xdp_priv.h @@ -12,12 +12,8 @@ struct xdp_mem_allocator { struct page_pool *page_pool; struct zero_copy_allocator *zc_alloc; }; - int disconnect_cnt; - unsigned long defer_start; struct rhash_head node; struct rcu_head rcu; - struct delayed_work defer_wq; - unsigned long defer_warn; }; #endif /* __LINUX_NET_XDP_PRIV_H__ */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index c7e3c9c5bad3..a7378bcd9928 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -317,19 +317,15 @@ __MEM_TYPE_MAP(__MEM_TYPE_TP_FN) TRACE_EVENT(mem_disconnect, - TP_PROTO(const struct xdp_mem_allocator *xa, - bool safe_to_remove, bool force), + TP_PROTO(const struct xdp_mem_allocator *xa), - TP_ARGS(xa, safe_to_remove, force), + TP_ARGS(xa), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) - __field(bool, safe_to_remove) - __field(bool, force) - __field(int, disconnect_cnt) ), TP_fast_assign( @@ -337,19 +333,12 @@ TRACE_EVENT(mem_disconnect, __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; - __entry->safe_to_remove = safe_to_remove; - __entry->force = force; - __entry->disconnect_cnt = xa->disconnect_cnt; ), - TP_printk("mem_id=%d mem_type=%s allocator=%p" - " safe_to_remove=%s force=%s disconnect_cnt=%d", + TP_printk("mem_id=%d 
mem_type=%s allocator=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), - __entry->allocator, - __entry->safe_to_remove ? "true" : "false", - __entry->force ? "true" : "false", - __entry->disconnect_cnt + __entry->allocator ) ); -- cgit v1.2.3 From 1e0bd5a091e5d9e0f1d5b0e6329b87bb1792f784 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:02 -0800 Subject: bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never fails 92117d8443bc ("bpf: fix refcnt overflow") turned refcounting of bpf_map into potentially failing operation, when refcount reaches BPF_MAX_REFCNT limit (32k). Due to using 32-bit counter, it's possible in practice to overflow refcounter and make it wrap around to 0, causing erroneous map free, while there are still references to it, causing use-after-free problems. But having failing refcounting operations is problematic in some cases. One example is mmap() interface. After establishing initial memory-mapping, user is allowed to arbitrarily map/remap/unmap parts of mapped memory, arbitrarily splitting it into multiple non-contiguous regions. All this happens without any control from the users of mmap subsystem. Rather mmap subsystem sends notifications to original creator of memory mapping through open/close callbacks, which are optionally specified during initial memory mapping creation. These callbacks are used to maintain accurate refcount for bpf_map (see next patch in this series). The problem is that open() callback is not supposed to fail, because memory-mapped resource is set up and properly referenced. This is posing a problem for using memory-mapping with BPF maps. One solution to this is to maintain separate refcount for just memory-mappings and do single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively. There are similar use cases in current work on tcp-bpf, necessitating extra counter as well. 
This seems like a rather unfortunate and ugly solution that doesn't scale well to various new use cases. Another approach to solve this is to use non-failing refcount_t type, which uses 32-bit counter internally, but, once reaching overflow state at UINT_MAX, stays there. This ultimately causes memory leak, but prevents use after free. But given refcounting is not the most performance-critical operation with BPF maps (it's not used from running BPF program code), we can also just switch to 64-bit counter that can't overflow in practice, potentially disadvantaging 32-bit platforms a tiny bit. This simplifies semantics and allows above described scenarios to not worry about failing refcount increment operation. In terms of struct bpf_map size, we are still good and use the same amount of space: BEFORE (3 cache lines, 8 bytes of padding at the end): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic_t refcnt __attribute__((__aligned__(64))); /* 128 4 */ atomic_t usercnt; /* 132 4 */ struct work_struct work; /* 136 32 */ char name[16]; /* 168 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 146, holes: 1, sum holes: 38 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); AFTER (same 3 cache lines, no extra padding now): struct bpf_map { 
const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */ atomic64_t usercnt; /* 136 8 */ struct work_struct work; /* 144 32 */ char name[16]; /* 176 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 154, holes: 1, sum holes: 38 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); This patch, while modifying all users of bpf_map_inc, also cleans up its interface to match bpf_map_put with separate operations for bpf_map_inc and bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref, respectively). Also, given there are no users of bpf_map_inc_not_zero specifying uref=true, remove uref flag and default to uref=false internally. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com --- include/linux/bpf.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b81cde47314..34a34445c009 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -103,8 +103,8 @@ struct bpf_map { /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. 
*/ - atomic_t refcnt ____cacheline_aligned; - atomic_t usercnt; + atomic64_t refcnt ____cacheline_aligned; + atomic64_t usercnt; struct work_struct work; char name[BPF_OBJ_NAME_LEN]; }; @@ -783,9 +783,9 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); -struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); -struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map, - bool uref); +void bpf_map_inc(struct bpf_map *map); +void bpf_map_inc_with_uref(struct bpf_map *map); +struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); -- cgit v1.2.3 From 85192dbf4de08795afe2b88e52a36fc6abfc3dba Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:03 -0800 Subject: bpf: Convert bpf_prog refcnt to atomic64_t Similarly to bpf_map's refcnt/usercnt, convert bpf_prog's refcnt to atomic64 and remove artificial 32k limit. This allows to make bpf_prog's refcounting non-failing, simplifying logic of users of bpf_prog_add/bpf_prog_inc. Validated compilation by running allyesconfig kernel build. 
Suggested-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191117172806.2195367-3-andriin@fb.com --- include/linux/bpf.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 34a34445c009..fb606dc61a3a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -485,7 +485,7 @@ struct bpf_func_info_aux { }; struct bpf_prog_aux { - atomic_t refcnt; + atomic64_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; u32 max_pkt_offset; @@ -770,9 +770,9 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); -struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); +void bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); -struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); +void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); @@ -912,10 +912,8 @@ static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, return ERR_PTR(-EOPNOTSUPP); } -static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, - int i) +static inline void bpf_prog_add(struct bpf_prog *prog, int i) { - return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_prog_sub(struct bpf_prog *prog, int i) @@ -926,9 +924,8 @@ static inline void bpf_prog_put(struct bpf_prog *prog) { } -static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog) +static inline void bpf_prog_inc(struct bpf_prog *prog) { - return ERR_PTR(-EOPNOTSUPP); } static inline struct bpf_prog *__must_check -- cgit v1.2.3 From fc9702273e2edb90400a34b3be76f7b08fa3344b Mon Sep 
17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:04 -0800 Subject: bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY Add ability to memory-map contents of BPF array map. This is extremely useful for working with BPF global data from userspace programs. It allows avoiding the typical bpf_map_{lookup,update}_elem operations, improving both performance and usability. There had to be special considerations for map freezing, to avoid having writable memory view into a frozen map. To solve this issue, map freezing and mmap-ing is happening under mutex now: - if map is already frozen, no writable mapping is allowed; - if map has writable memory mappings active (accounted in map->writecnt), map freezing will keep failing with -EBUSY; - once number of writable memory mappings drops to zero, map freezing can be performed again. Only non-per-CPU plain arrays are supported right now. Maps with spinlocks can't be memory mapped either. For BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc() to be mmap()'able. We also need to make sure that array data memory is page-sized and page-aligned, so we over-allocate memory in such a way that struct bpf_array is at the end of a single page of memory with array->value being aligned with the start of the second page. On deallocation we need to accommodate this memory arrangement to free vmalloc()'ed memory correctly. One important consideration concerns how the memory-mapping subsystem functions. Memory-mapping subsystem provides a few optional callbacks, among them open() and close(). close() is called for each memory region that is unmapped, so that users can decrease their reference counters and free up resources, if necessary. open() is *almost* symmetrical: it's called for each memory region that is being mapped, **except** the very first one. So bpf_map_mmap does initial refcnt bump, while open() will do any extra ones after that. 
Thus number of close() calls is equal to number of open() calls plus one more. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Acked-by: Johannes Weiner Link: https://lore.kernel.org/bpf/20191117172806.2195367-4-andriin@fb.com --- include/linux/bpf.h | 11 ++++++++--- include/linux/vmalloc.h | 1 + include/uapi/linux/bpf.h | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fb606dc61a3a..e913dd5946ae 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,7 @@ struct bpf_map_ops { u64 *imm, u32 off); int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); + int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); }; struct bpf_map_memory { @@ -96,9 +98,10 @@ struct bpf_map { u32 btf_value_type_id; struct btf *btf; struct bpf_map_memory memory; + char name[BPF_OBJ_NAME_LEN]; bool unpriv_array; - bool frozen; /* write-once */ - /* 48 bytes hole */ + bool frozen; /* write-once; write-protected by freeze_mutex */ + /* 22 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. 
@@ -106,7 +109,8 @@ struct bpf_map { atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; struct work_struct work; - char name[BPF_OBJ_NAME_LEN]; + struct mutex freeze_mutex; + u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */ }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -795,6 +799,7 @@ void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); void *bpf_map_area_alloc(size_t size, int numa_node); +void *bpf_map_area_mmapable_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 4e7809408073..b4c58a191eb1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -93,6 +93,7 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); +extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4842a134b202..dbbcf0b02970 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -348,6 +348,9 @@ enum bpf_attach_type { /* Clone map from listener for newly accepted socket */ #define BPF_F_CLONE (1U << 9) +/* Enable memory-mapping BPF map */ +#define BPF_F_MMAPABLE (1U << 10) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) -- cgit v1.2.3 From 298e54fa810e027f1b0800d789eb862592721f08 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 15 Nov 2019 19:56:51 +0000 Subject: net: phy: add core phylib sfp support Add core phylib help for supporting SFP sockets on 
PHYs. This provides a mechanism to inform the SFP layer about PHY up/down events, and also unregister the SFP bus when the PHY is going away. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 78436d58ce7c..124516fe2763 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -203,6 +203,8 @@ static inline const char *phy_modes(phy_interface_t interface) struct device; struct phylink; +struct sfp_bus; +struct sfp_upstream_ops; struct sk_buff; /* @@ -342,6 +344,8 @@ struct phy_c45_device_ids { * dev_flags: Device-specific flags used by the PHY driver. * irq: IRQ number of the PHY's interrupt (-1 if none) * phy_timer: The timer for handling the state machine + * sfp_bus_attached: flag indicating whether the SFP bus has been attached + * sfp_bus: SFP bus attached to this PHY's fiber port * attached_dev: The attached enet driver's device instance ptr * adjust_link: Callback for the enet controller to respond to * changes in the link state. 
@@ -432,6 +436,9 @@ struct phy_device { struct mutex lock; + /* This may be modified under the rtnl lock */ + bool sfp_bus_attached; + struct sfp_bus *sfp_bus; struct phylink *phylink; struct net_device *attached_dev; @@ -1020,6 +1027,10 @@ int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); +void phy_sfp_attach(void *upstream, struct sfp_bus *bus); +void phy_sfp_detach(void *upstream, struct sfp_bus *bus); +int phy_sfp_probe(struct phy_device *phydev, + const struct sfp_upstream_ops *ops); struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phy_interface_t interface); struct phy_device *phy_find_first(struct mii_bus *bus); -- cgit v1.2.3 From 7c9e69428da39ed761c9d903c4850368fa4ef7bf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 16 Nov 2019 12:22:43 +0100 Subject: page_pool: add destroy attempts counter and rename tracepoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Jonathan changed the page_pool to become responsible for its own shutdown via deferred work queue, the disconnect_cnt counter was removed from the xdp memory model tracepoint. This patch changes the page_pool_inflight tracepoint name to page_pool_release, because it reflects the new responsibility better. And it reintroduces a counter that reflects the number of times page_pool_release has been tried. The counter is also used by the code, to only empty the alloc cache once. With a stuck work queue running every second and counter being 64-bit, it will overrun in approx 584 billion years. For comparison, Earth lifetime expectancy is 7.5 billion years, before the Sun will engulf, and destroy, the Earth. Signed-off-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Signed-off-by: David S. 
Miller --- include/net/page_pool.h | 2 ++ include/trace/events/page_pool.h | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 1121faa99c12..ace881c15dcb 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -112,6 +112,8 @@ struct page_pool { * refcnt serves purpose is to simplify drivers error handling. */ refcount_t user_cnt; + + u64 destroy_cnt; }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index 47b5ee880aa9..ee7f1aca7839 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -10,7 +10,7 @@ #include -TRACE_EVENT(page_pool_inflight, +TRACE_EVENT(page_pool_release, TP_PROTO(const struct page_pool *pool, s32 inflight, u32 hold, u32 release), @@ -22,6 +22,7 @@ TRACE_EVENT(page_pool_inflight, __field(s32, inflight) __field(u32, hold) __field(u32, release) + __field(u64, cnt) ), TP_fast_assign( @@ -29,10 +30,12 @@ TRACE_EVENT(page_pool_inflight, __entry->inflight = inflight; __entry->hold = hold; __entry->release = release; + __entry->cnt = pool->destroy_cnt; ), - TP_printk("page_pool=%p inflight=%d hold=%u release=%u", - __entry->pool, __entry->inflight, __entry->hold, __entry->release) + TP_printk("page_pool=%p inflight=%d hold=%u release=%u cnt=%llu", + __entry->pool, __entry->inflight, __entry->hold, + __entry->release, __entry->cnt) ); TRACE_EVENT(page_pool_state_release, -- cgit v1.2.3 From 832ccf6f80cda06ad2373cd1f40291b0183958b4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 16 Nov 2019 12:22:48 +0100 Subject: page_pool: extend tracepoint to also include the page PFN The MM tracepoint for page free (called kmem:mm_page_free) doesn't provide the page pointer directly, instead it provides the PFN (Page Frame Number). This is annoying when writing a page_pool leak detector in BPF. 
This patch changes the page_pool tracepoints to also provide the PFN. The page pointer is still provided to allow other kinds of troubleshooting from BPF. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/trace/events/page_pool.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index ee7f1aca7839..2f2a10e8eb56 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -8,6 +8,7 @@ #include #include +#include #include TRACE_EVENT(page_pool_release, @@ -49,16 +50,18 @@ TRACE_EVENT(page_pool_state_release, __field(const struct page_pool *, pool) __field(const struct page *, page) __field(u32, release) + __field(unsigned long, pfn) ), TP_fast_assign( __entry->pool = pool; __entry->page = page; __entry->release = release; + __entry->pfn = page_to_pfn(page); ), - TP_printk("page_pool=%p page=%p release=%u", - __entry->pool, __entry->page, __entry->release) + TP_printk("page_pool=%p page=%p pfn=%lu release=%u", + __entry->pool, __entry->page, __entry->pfn, __entry->release) ); TRACE_EVENT(page_pool_state_hold, @@ -72,16 +75,18 @@ TRACE_EVENT(page_pool_state_hold, __field(const struct page_pool *, pool) __field(const struct page *, page) __field(u32, hold) + __field(unsigned long, pfn) ), TP_fast_assign( __entry->pool = pool; __entry->page = page; __entry->hold = hold; + __entry->pfn = page_to_pfn(page); ), - TP_printk("page_pool=%p page=%p hold=%u", - __entry->pool, __entry->page, __entry->hold) + TP_printk("page_pool=%p page=%p pfn=%lu hold=%u", + __entry->pool, __entry->page, __entry->pfn, __entry->hold) ); #endif /* _TRACE_PAGE_POOL_H */ -- cgit v1.2.3 From 7cd9a58d6860ae09acd7f0c219b5fa333703f72f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Nov 2019 23:05:52 +0100 Subject: netfilter: nf_tables: constify nft_reg_load{8, 16, 64}() This patch constifies the pointer to source 
register data that is passed as an input parameter. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_tables.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 87b758407868..fe7c50acc681 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -114,7 +114,7 @@ static inline void nft_reg_store8(u32 *dreg, u8 val) *(u8 *)dreg = val; } -static inline u8 nft_reg_load8(u32 *sreg) +static inline u8 nft_reg_load8(const u32 *sreg) { return *(u8 *)sreg; } @@ -125,7 +125,7 @@ static inline void nft_reg_store16(u32 *dreg, u16 val) *(u16 *)dreg = val; } -static inline u16 nft_reg_load16(u32 *sreg) +static inline u16 nft_reg_load16(const u32 *sreg) { return *(u16 *)sreg; } @@ -135,7 +135,7 @@ static inline void nft_reg_store64(u32 *dreg, u64 val) put_unaligned(val, (u64 *)dreg); } -static inline u64 nft_reg_load64(u32 *sreg) +static inline u64 nft_reg_load64(const u32 *sreg) { return get_unaligned((u64 *)sreg); } -- cgit v1.2.3 From 8819efc9430142957c9c8fc7c09d9107e2061b87 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Nov 2019 23:05:53 +0100 Subject: netfilter: nf_tables_offload: allow ethernet interface type only Hardware offload support at this stage assumes an ethernet device in place. The flow dissector provides the intermediate representation to express this selector, so extend it to allow storing the interface type. Flower does not use this, so skb_flow_dissect_meta() is not extended to match on this new field. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index b1063db63e66..1a0727d1acfa 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -203,9 +203,11 @@ struct flow_dissector_key_ip { /** * struct flow_dissector_key_meta: * @ingress_ifindex: ingress ifindex + * @ingress_iftype: ingress interface type */ struct flow_dissector_key_meta { int ingress_ifindex; + u16 ingress_iftype; }; /** -- cgit v1.2.3 From a82055af595946aea461528e551e6ae064b3d560 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Nov 2019 23:05:54 +0100 Subject: netfilter: nft_payload: add VLAN offload support Match on ethertype and set up protocol dependency. Check for protocol dependency before accessing the tci field. Allow to match on the encapsulated ethertype too. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 1a0727d1acfa..f06b0239c32b 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -48,9 +48,12 @@ struct flow_dissector_key_tags { }; struct flow_dissector_key_vlan { - u16 vlan_id:12, - vlan_dei:1, - vlan_priority:3; + union { + u16 vlan_id:12, + vlan_dei:1, + vlan_priority:3; + __be16 vlan_tci; + }; __be16 vlan_tpid; }; -- cgit v1.2.3 From bc836748707cf6b8b1a948b61149278f109107da Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 20 Nov 2019 00:15:17 +0000 Subject: page_pool: Add API to update numa node Add page_pool_update_nid() to be called by page pool consumers when they detect numa node changes. It will update the page pool nid value to start allocating from the new effective numa node. 
This is to mitigate page pool allocating pages from a wrong numa node, where the pool was originally allocated, and holding on to pages that belong to a different numa node, which causes performance degradation. For pages that are already being consumed and could be returned to the pool by the consumer, in next patch we will add a check per page to avoid recycling them back to the pool and return them to the page allocator. Signed-off-by: Saeed Mahameed Acked-by: Jonathan Lemon Reviewed-by: Ilias Apalodimas Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 7 +++++++ include/trace/events/page_pool.h | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index ace881c15dcb..e2e1b7b1e8ba 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -204,4 +204,11 @@ static inline bool page_pool_put(struct page_pool *pool) return refcount_dec_and_test(&pool->user_cnt); } +/* Caller must provide appropriate safe context, e.g. NAPI. 
*/ +void page_pool_update_nid(struct page_pool *pool, int new_nid); +static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) +{ + if (unlikely(pool->p.nid != new_nid)) + page_pool_update_nid(pool, new_nid); +} #endif /* _NET_PAGE_POOL_H */ diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index 2f2a10e8eb56..ad0aa7f31675 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -89,6 +89,28 @@ TRACE_EVENT(page_pool_state_hold, __entry->pool, __entry->page, __entry->pfn, __entry->hold) ); +TRACE_EVENT(page_pool_update_nid, + + TP_PROTO(const struct page_pool *pool, int new_nid), + + TP_ARGS(pool, new_nid), + + TP_STRUCT__entry( + __field(const struct page_pool *, pool) + __field(int, pool_nid) + __field(int, new_nid) + ), + + TP_fast_assign( + __entry->pool = pool; + __entry->pool_nid = pool->p.nid; + __entry->new_nid = new_nid; + ), + + TP_printk("page_pool=%p pool_nid=%d new_nid=%d", + __entry->pool, __entry->pool_nid, __entry->new_nid) +); + #endif /* _TRACE_PAGE_POOL_H */ /* This part must be outside protection */ -- cgit v1.2.3 From cec2975f2b7058c42330a0f8164d94c6b7c8c446 Mon Sep 17 00:00:00 2001 From: Gautam Ramakrishnan Date: Wed, 20 Nov 2019 19:43:54 +0530 Subject: net: sched: pie: enable timestamp based delay calculation RFC 8033 suggests an alternative approach to calculate the queue delay in PIE by using a timestamp on every enqueued packet. This patch adds an implementation of that approach and sets it as the default method to calculate queue delay. The previous method (based on Little's law) to calculate queue delay is set as optional. Signed-off-by: Gautam Ramakrishnan Signed-off-by: Leslie Monis Signed-off-by: Mohit P. Tahiliani Acked-by: Dave Taht Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 5011259b8f67..9f1a72876212 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -950,19 +950,25 @@ enum { TCA_PIE_BETA, TCA_PIE_ECN, TCA_PIE_BYTEMODE, + TCA_PIE_DQ_RATE_ESTIMATOR, __TCA_PIE_MAX }; #define TCA_PIE_MAX (__TCA_PIE_MAX - 1) struct tc_pie_xstats { - __u64 prob; /* current probability */ - __u32 delay; /* current delay in ms */ - __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ - __u32 packets_in; /* total number of packets enqueued */ - __u32 dropped; /* packets dropped due to pie_action */ - __u32 overlimit; /* dropped due to lack of space in queue */ - __u32 maxq; /* maximum queue size */ - __u32 ecn_mark; /* packets marked with ecn*/ + __u64 prob; /* current probability */ + __u32 delay; /* current delay in ms */ + __u32 avg_dq_rate; /* current average dq_rate in + * bits/pie_time + */ + __u32 dq_rate_estimating; /* is avg_dq_rate being calculated? */ + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to pie_action */ + __u32 overlimit; /* dropped due to lack of space + * in queue + */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ }; /* CBS */ -- cgit v1.2.3 From e68bc75691cc3de608c2c7505057c948d13ae587 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 20 Nov 2019 16:54:18 +0200 Subject: net: page_pool: add the possibility to sync DMA memory for device Introduce the following parameters in order to add the possibility to sync DMA memory for device before putting allocated pages in the page_pool caches: - PP_FLAG_DMA_SYNC_DEV: if set in page_pool_params flags, all pages that the driver gets from page_pool will be DMA-synced-for-device according to the length provided by the device driver. 
Please note DMA-sync-for-CPU is still device driver responsibility - offset: DMA address offset where the DMA engine starts copying rx data - max_len: maximum DMA memory size page_pool is allowed to flush. This is currently used in __page_pool_alloc_pages_slow routine when pages are allocated from page allocator. These parameters are supposed to be set by device drivers. This optimization reduces the length of the DMA-sync-for-device. The optimization is valid because pages are initially DMA-synced-for-device as defined via max_len. At RX time, the driver will perform a DMA-sync-for-CPU on the memory for the packet length. What is important is the memory occupied by packet payload, because this is the area CPU is allowed to read and modify. As we don't track cache-lines written into by the CPU, simply use the packet payload length as dma_sync_size at page_pool recycle time. This also takes into account any tail-extend. Tested-by: Matteo Croce Signed-off-by: Lorenzo Bianconi Signed-off-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index e2e1b7b1e8ba..cfbed00ba7ee 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -34,8 +34,18 @@ #include #include -#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ -#define PP_FLAG_ALL PP_FLAG_DMA_MAP +#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA + * map/unmap + */ +#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets + * from page_pool will be + * DMA-synced-for-device according to + * the length provided by the device + * driver. 
+ * Please note DMA-sync-for-CPU is still + * device driver responsibility + */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) /* * Fast allocation side cache array/stack @@ -65,6 +75,8 @@ struct page_pool_params { int nid; /* Numa node id to allocate from pages from */ struct device *dev; /* device, for DMA pre-mapping purposes */ enum dma_data_direction dma_dir; /* DMA mapping direction */ + unsigned int max_len; /* max DMA sync memory size */ + unsigned int offset; /* DMA addr offset */ }; struct page_pool { @@ -151,8 +163,8 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, #endif /* Never call this directly, use helpers below */ -void __page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct); +void __page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct); static inline void page_pool_put_page(struct page_pool *pool, struct page *page, bool allow_direct) @@ -161,14 +173,14 @@ static inline void page_pool_put_page(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - __page_pool_put_page(pool, page, allow_direct); + __page_pool_put_page(pool, page, -1, allow_direct); #endif } /* Very limited use-cases allow recycle direct */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { - __page_pool_put_page(pool, page, true); + __page_pool_put_page(pool, page, -1, true); } /* Disconnects a page (from a page_pool). API users can have a need -- cgit v1.2.3 From 91e6015b082b08a74e5d9d326f651e5890a93519 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Nov 2019 22:38:16 +0100 Subject: bpf: Emit audit messages upon successful prog load and unload Allow for audit messages to be emitted upon BPF program load and unload for having a timeline of events. 
The load itself is in syscall context, so additional info about the process initiating the BPF prog creation can be logged and later directly correlated to the unload event. The only info really needed from BPF side is the globally unique prog ID where then audit user space tooling can query / dump all info needed about the specific BPF program right upon load event and enrich the record, thus these changes needed here can be kept small and non-intrusive to the core. Raw example output: # auditctl -D # auditctl -a always,exit -F arch=x86_64 -S bpf # ausearch --start recent -m 1334 [...] ---- time->Wed Nov 20 12:45:51 2019 type=PROCTITLE msg=audit(1574271951.590:8974): proctitle="./test_verifier" type=SYSCALL msg=audit(1574271951.590:8974): arch=c000003e syscall=321 success=yes exit=14 a0=5 a1=7ffe2d923e80 a2=78 a3=0 items=0 ppid=742 pid=949 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=2 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1334] msg=audit(1574271951.590:8974): auid=0 uid=0 gid=0 ses=2 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 pid=949 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" prog-id=3260 event=LOAD ---- time->Wed Nov 20 12:45:51 2019 type=UNKNOWN[1334] msg=audit(1574271951.590:8975): prog-id=3260 event=UNLOAD ---- [...] 
Signed-off-by: Daniel Borkmann Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191120213816.8186-1-jolsa@kernel.org --- include/linux/audit.h | 3 +++ include/uapi/linux/audit.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index aee3dc9eb378..edd006f4597d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -159,6 +159,7 @@ extern void audit_log_key(struct audit_buffer *ab, extern void audit_log_link_denied(const char *operation); extern void audit_log_lost(const char *message); +extern void audit_log_task(struct audit_buffer *ab); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -219,6 +220,8 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_link_denied(const char *string) { } +static inline void audit_log_task(struct audit_buffer *ab) +{ } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index c89c6495983d..32a5db900f47 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -116,6 +116,7 @@ #define AUDIT_FANOTIFY 1331 /* Fanotify access decision */ #define AUDIT_TIME_INJOFFSET 1332 /* Timekeeping offset injected */ #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */ +#define AUDIT_BPF 1334 /* BPF subsystem */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ -- cgit v1.2.3 From 196e8ca74886c433dcfc64a809707074b936aaf5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Nov 2019 23:04:44 +0100 Subject: bpf: Switch bpf_map_{area_alloc,area_mmapable_alloc}() to u64 size Given we recently extended the original bpf_map_area_alloc() helper in commit fc9702273e2e ("bpf: Add mmap() support for 
BPF_MAP_TYPE_ARRAY"), we need to apply the same logic as in ff1c08e1f74b ("bpf: Change size to u64 for bpf_map_{area_alloc, charge_init}()"). To avoid conflicts, extend it for bpf-next. Reported-by: Stephen Rothwell Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e913dd5946ae..e89e86122233 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -794,12 +794,12 @@ void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); -int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size); +int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); -void *bpf_map_area_alloc(size_t size, int numa_node); -void *bpf_map_area_mmapable_alloc(size_t size, int numa_node); +void *bpf_map_area_alloc(u64 size, int numa_node); +void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); -- cgit v1.2.3 From f3c9a666b28572b1a0ae691a47d9a7de4d9cefb3 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 20 Nov 2019 12:29:59 +0000 Subject: net: sfp: soft status and control support Add support for the soft status and control register, which allows TX_FAULT and RX_LOS to be monitored and TX_DISABLE to be set. We make use of this when the board does not support GPIOs for these signals. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/sfp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 3b35efd85bb1..487fd9412d10 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -428,6 +428,10 @@ enum { SFP_TEC_CUR = 0x6c, SFP_STATUS = 0x6e, + SFP_STATUS_TX_DISABLE = BIT(7), + SFP_STATUS_TX_DISABLE_FORCE = BIT(6), + SFP_STATUS_TX_FAULT = BIT(2), + SFP_STATUS_RX_LOS = BIT(1), SFP_ALARM0 = 0x70, SFP_ALARM0_TEMP_HIGH = BIT(7), SFP_ALARM0_TEMP_LOW = BIT(6), -- cgit v1.2.3 From fca3f91cc38ad866c995fb099d961b31cd687849 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:26 +0800 Subject: net: sched: add vxlan option support to act_tunnel_key This patch is to allow setting vxlan options using the act_tunnel_key action. Different from geneve options, only one option can be set. And also, geneve options and vxlan options can't be set at the same time. gbp is the only param for vxlan options: # ip link add name vxlan0 type vxlan dstport 0 external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ vxlan_opts 01020304 \ action mirred egress redirect dev vxlan0 v1->v2: - add .strict_start_type for enc_opts_policy as Jakub noticed. - use Duplicate instead of Wrong in err msg for extack as Jakub suggested. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index 41c8b462c177..f302c2a76953 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -50,6 +50,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_ * attributes */ + TCA_TUNNEL_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ __TCA_TUNNEL_KEY_ENC_OPTS_MAX, }; @@ -67,4 +71,13 @@ enum { #define TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX - 1) +enum { + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1) + #endif -- cgit v1.2.3 From e20d4ff2acd7db2ffce64a6ddbdaeec43a8eec19 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:27 +0800 Subject: net: sched: add erspan option support to act_tunnel_key This patch is to allow setting erspan options using the act_tunnel_key action. Different from geneve options, only one option can be set. And also, geneve options, vxlan options or erspan options can't be set at the same time. Options are expressed as ver:index:dir:hwid, when ver is set to 1, index will be applied while dir and hwid will be ignored, and when ver is set to 2, dir and hwid will be used while index will be ignored. # ip link add name erspan1 type erspan external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ erspan_opts 1:2:0:0 \ action mirred egress redirect dev erspan1 v1->v2: - do the validation when dst is not yet allocated as Jakub suggested. - use Duplicate instead of Wrong in err msg for extack. 
Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index f302c2a76953..3f10dc4e7a4b 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -54,6 +54,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_ * attributes */ + TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ __TCA_TUNNEL_KEY_ENC_OPTS_MAX, }; @@ -80,4 +84,16 @@ enum { #define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \ (__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1) +enum { + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, /* be32 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID, /* u8 */ + __TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX - 1) + #endif -- cgit v1.2.3 From d8f9dfae49ce4ffb772dc10dd6578dc815b34c12 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:28 +0800 Subject: net: sched: allow flower to match vxlan options This patch is to allow matching gbp option in vxlan. The options can be described in the form GBP/GBP_MASK, where GBP is represented as a 32bit hexadecimal value. Different from geneve, only one option can be set. And also, geneve options and vxlan options can't be set at the same time. # ip link add name vxlan0 type vxlan dstport 0 external # tc qdisc add dev vxlan0 ingress # tc filter add dev vxlan0 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ vxlan_opts 01020304/ffffffff \ ip_proto udp \ action mirred egress redirect dev eth0 v1->v2: - add .strict_start_type for enc_opts_policy as Jakub noticed. 
- use Duplicate instead of Wrong in err msg for extack as Jakub suggested. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index c6ad22f76ede..929825d710e2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -571,6 +571,10 @@ enum { * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_VXLAN_ + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -588,6 +592,15 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), -- cgit v1.2.3 From 79b1011cb33d166f531a1347a17e6602954e4eb1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:29 +0800 Subject: net: sched: allow flower to match erspan options This patch is to allow matching options in erspan. The options can be described in the form: VER:INDEX:DIR:HWID/VER:INDEX_MASK:DIR_MASK:HWID_MASK. When ver is set to 1, index will be applied while dir and hwid will be ignored, and when ver is set to 2, dir and hwid will be used while index will be ignored. Different from geneve, only one option can be set. And also, geneve options, vxlan options or erspan options can't be set at the same time. 
# ip link add name erspan1 type erspan external # tc qdisc add dev erspan1 ingress # tc filter add dev erspan1 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ erspan_opts 1:12:0:0/1:ffff:0:0 \ ip_proto udp \ action mirred egress redirect dev eth0 v1->v2: - improve some err msgs of extack. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 929825d710e2..449a63971451 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -575,6 +575,10 @@ enum { * TCA_FLOWER_KEY_ENC_OPT_VXLAN_ * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_ERSPAN, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_ERSPAN_ + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -601,6 +605,18 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, /* be32 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID, /* u8 */ + __TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), -- cgit v1.2.3 From 7599a896f2e46e9c072e02a8299a67d4d2f96675 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 21 Nov 2019 16:58:53 +0100 Subject: audit: Move audit_log_task declaration under CONFIG_AUDITSYSCALL The 0-DAY found that audit_log_task is not declared under CONFIG_AUDITSYSCALL which causes compilation error when it is not defined: kernel/bpf/syscall.o: In function `bpf_audit_prog.isra.30': >> syscall.c:(.text+0x860): undefined reference to `audit_log_task' Adding the audit_log_task declaration 
and stub within CONFIG_AUDITSYSCALL ifdef. Fixes: 91e6015b082b ("bpf: Emit audit messages upon successful prog load and unload") Reported-by: kbuild test robot Signed-off-by: Jiri Olsa Signed-off-by: David S. Miller --- include/linux/audit.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index edd006f4597d..18925d924c73 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -159,7 +159,6 @@ extern void audit_log_key(struct audit_buffer *ab, extern void audit_log_link_denied(const char *operation); extern void audit_log_lost(const char *message); -extern void audit_log_task(struct audit_buffer *ab); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -220,8 +219,6 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_link_denied(const char *string) { } -static inline void audit_log_task(struct audit_buffer *ab) -{ } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; @@ -361,6 +358,8 @@ static inline void audit_ptrace(struct task_struct *t) __audit_ptrace(t); } +extern void audit_log_task(struct audit_buffer *ab); + /* Private API (for audit.c only) */ extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); @@ -648,6 +647,9 @@ static inline void audit_ntp_log(const struct audit_ntp_data *ad) static inline void audit_ptrace(struct task_struct *t) { } + +static inline void audit_log_task(struct audit_buffer *ab) +{ } #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ -- cgit v1.2.3 From f145922ddcaa1cb9688b3d053622c98d9f9a7fff Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 20 Nov 2019 16:23:14 +0800 Subject: net: mscc: ocelot: export ocelot_hwstamp_get/set functions Export 
ocelot_hwstamp_get/set functions so that DSA driver is able to reuse them. Signed-off-by: Yangbo Lu Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index a836afe8f68e..2bac4bc34cf6 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -533,6 +533,8 @@ int ocelot_fdb_del(struct ocelot *ocelot, int port, int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, bool untagged); int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); +int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); +int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts); void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts); -- cgit v1.2.3 From e23a7b3e8daa4be3d91544d8ba210f96d2266de9 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 20 Nov 2019 16:23:15 +0800 Subject: net: mscc: ocelot: convert to use ocelot_get_txtstamp() The method getting TX timestamp by reading timestamp FIFO and matching skbs list is common for DSA Felix driver too. So move code out of ocelot_board.c, convert to use ocelot_get_txtstamp() function and export it. Signed-off-by: Yangbo Lu Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/soc/mscc/ocelot.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2bac4bc34cf6..1a5cb1b2ac5d 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -406,6 +406,13 @@ struct ocelot_ops { int (*reset)(struct ocelot *ocelot); }; +struct ocelot_skb { + struct list_head head; + struct sk_buff *skb; + u8 id; +}; + + struct ocelot_port { struct ocelot *ocelot; @@ -536,6 +543,6 @@ int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts); -void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts); +void ocelot_get_txtstamp(struct ocelot *ocelot); #endif -- cgit v1.2.3 From 400928bf928be153cddd76d9ac4e39978cb43fd3 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 20 Nov 2019 16:23:16 +0800 Subject: net: mscc: ocelot: convert to use ocelot_port_add_txtstamp_skb() Convert to use ocelot_port_add_txtstamp_skb() for adding skbs which require TX timestamp into list. Export it so that DSA Felix driver could reuse it too. Signed-off-by: Yangbo Lu Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/soc/mscc/ocelot.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 1a5cb1b2ac5d..e1108a5f4f17 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -543,6 +543,8 @@ int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts); +int ocelot_port_add_txtstamp_skb(struct ocelot_port *ocelot_port, + struct sk_buff *skb); void ocelot_get_txtstamp(struct ocelot *ocelot); #endif -- cgit v1.2.3 From 1f8ac5703037fdd2e6c960cd35c2b14d18ef3933 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Nov 2019 13:47:33 +0100 Subject: ipv6: add fib6_has_custom_rules() helper It wraps the namespace field with the same name, to easily access it regardless of build options. Suggested-by: David Ahern Suggested-by: Eric Dumazet Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 5d1615463138..8ac3a59e5126 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -502,6 +502,11 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) } #ifdef CONFIG_IPV6_MULTIPLE_TABLES +static inline bool fib6_has_custom_rules(const struct net *net) +{ + return net->ipv6.fib6_has_custom_rules; +} + int fib6_rules_init(void); void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); @@ -527,6 +532,10 @@ static inline bool fib6_rules_early_flow_dissect(struct net *net, return true; } #else +static inline bool fib6_has_custom_rules(const struct net *net) +{ + return false; +} static inline int fib6_rules_init(void) { return 0; -- cgit v1.2.3 From b9b33e7c24af1cddc7697056f1664279a40d9a4a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Nov 2019 13:47:34 +0100 Subject: ipv6: keep track of routes using src Use a per namespace counter, increment it on successful creation of any route using the source address, decrement it on deletion of such routes. This allows us to check easily if the routing decision in the current namespace depends on the packet source. Will be used by the next patch. Suggested-by: David Ahern Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 30 ++++++++++++++++++++++++++++++ include/net/netns/ipv6.h | 3 +++ 2 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 8ac3a59e5126..f1535f172935 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -90,7 +90,32 @@ struct fib6_gc_args { #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL + +static inline bool fib6_routes_require_src(const struct net *net) +{ + return false; +} + +static inline void fib6_routes_require_src_inc(struct net *net) {} +static inline void fib6_routes_require_src_dec(struct net *net) {} + #else + +static inline bool fib6_routes_require_src(const struct net *net) +{ + return net->ipv6.fib6_routes_require_src > 0; +} + +static inline void fib6_routes_require_src_inc(struct net *net) +{ + net->ipv6.fib6_routes_require_src++; +} + +static inline void fib6_routes_require_src_dec(struct net *net) +{ + net->ipv6.fib6_routes_require_src--; +} + #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif @@ -212,6 +237,11 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } +static inline bool fib6_requires_src(const struct fib6_info *rt) +{ + return rt->fib6_src.plen > 0; +} + static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->fib6_flags &= ~RTF_EXPIRES; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 022a0fd1a5a4..5ec054473d81 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -83,6 +83,9 @@ struct netns_ipv6 { #ifdef CONFIG_IPV6_MULTIPLE_TABLES unsigned int fib6_rules_require_fldissect; bool fib6_has_custom_rules; +#ifdef CONFIG_IPV6_SUBTREES + unsigned int fib6_routes_require_src; +#endif struct rt6_info *ip6_prohibit_entry; struct rt6_info *ip6_blk_hole_entry; struct fib6_table *fib6_local_tbl; -- cgit v1.2.3 From c43c3d76c021d8d654ff5cfaad381f14f6beaf1a Mon Sep 17 00:00:00 2001 
From: Paolo Abeni Date: Wed, 20 Nov 2019 13:47:36 +0100 Subject: ipv4: move fib4_has_custom_rules() helper to public header So that we can use it in the next patch. Additionally constify the helper argument. Suggested-by: David Ahern Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 52b2406a5dfc..b9cba41c6d4f 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -311,6 +311,11 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp, return err; } +static inline bool fib4_has_custom_rules(const struct net *net) +{ + return false; +} + static inline bool fib4_rule_default(const struct fib_rule *rule) { return true; @@ -378,6 +383,11 @@ out: return err; } +static inline bool fib4_has_custom_rules(const struct net *net) +{ + return net->ipv4.fib_has_custom_rules; +} + bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 02b24941619fcce3d280311ac73b1e461552e9c8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Nov 2019 13:47:37 +0100 Subject: ipv4: use dst hint for ipv4 list receive This is similar to the previous change, with some additional ipv4-specific quirks. Even when using the route hint we still have to perform additional per-packet checks about source address validity: a new helper is added to wrap them. Hints are explicitly disabled if the destination is a local broadcast; that keeps the code simple, and local broadcasts are a slower path anyway. UDP flood performance vs recvmmsg() receiver: vanilla 1683 Kpps, patched 1871 Kpps, delta +11%. In the worst case scenario - each packet has a different destination address - the performance delta is within noise range.
v3 -> v4: - re-enable hints for forward v2 -> v3: - really fix build (sic) and hint usage check - use fib4_has_custom_rules() helpers (David A.) - add ip_extract_route_hint() helper (Edward C.) - use prev skb as hint instead of copying data (Willem) v1 -> v2: - fix build issue with !CONFIG_IP_MULTIPLE_TABLES Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/route.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index 6c516840380d..a9c60fc68e36 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -185,6 +185,10 @@ int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin, struct fib_result *res); +int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin, + const struct sk_buff *hint); + static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin) { -- cgit v1.2.3 From db3e1c40cf2f973fbdd52ae0b59a9472b1c04f4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 18 Nov 2019 22:06:08 -0800 Subject: mac80211: Import airtime calculation code from mt76 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Felix recently added code to calculate airtime of packets to the mt76 driver. Import this into mac80211 so we can use it for airtime queue limit calculations. The airtime.c file is copied verbatim from the mt76 driver, and adjusted to be usable in mac80211. This involves: - Switching to mac80211 data structures. - Adding support for 160 MHz channels and HE mode. - Moving the symbol and duration calculations around a bit to avoid rounding with the higher rates and longer symbol times used for HE rates. The per-rate TX rate calculation is also split out to its own function so it can be used directly for the AQL calculations later. 
Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20191119060610.76681-3-kyan@google.com [fix HE_GROUP_IDX() to use 3 * bw, since there are 3 _gi values] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c643a19dce96..6fc26a051ba0 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6424,4 +6424,33 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif, struct cfg80211_nan_match_params *match, gfp_t gfp); +/** + * ieee80211_calc_rx_airtime - calculate estimated transmission airtime for RX. + * + * This function calculates the estimated airtime usage of a frame based on the + * rate information in the RX status struct and the frame length. + * + * @hw: pointer as obtained from ieee80211_alloc_hw() + * @status: &struct ieee80211_rx_status containing the transmission rate + * information. + * @len: frame length in bytes + */ +u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, + struct ieee80211_rx_status *status, + int len); + +/** + * ieee80211_calc_tx_airtime - calculate estimated transmission airtime for TX. + * + * This function calculates the estimated airtime usage of a frame based on the + * rate information in the TX info struct and the frame length. + * + * @hw: pointer as obtained from ieee80211_alloc_hw() + * @info: &struct ieee80211_tx_info of the frame. 
+ * @len: frame length in bytes + */ +u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, + struct ieee80211_tx_info *info, + int len); + #endif /* MAC80211_H */ -- cgit v1.2.3 From 3ace10f5b5ad94bdbd4b419dc9da2217d57720a9 Mon Sep 17 00:00:00 2001 From: Kan Yan Date: Mon, 18 Nov 2019 22:06:09 -0800 Subject: mac80211: Implement Airtime-based Queue Limit (AQL) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order for the Fq_CoDel algorithm integrated in mac80211 layer to operate effectively to control excessive queueing latency, the CoDel algorithm requires an accurate measure of how long packets stay in the queue, AKA sojourn time. The sojourn time measured at the mac80211 layer doesn't include queueing latency in the lower layer (firmware/hardware) and CoDel expects lower layer to have a short queue. However, most 802.11ac chipsets offload tasks such as TX aggregation to firmware or hardware, and thus have a deep lower layer queue. Without a mechanism to control the lower layer queue size, packets only stay in mac80211 layer transiently before being sent to firmware queue. As a result, the sojourn time measured by CoDel in the mac80211 layer is almost always lower than the CoDel latency target, hence CoDel does little to control the latency, even when the lower layer queue causes excessive latency. The Byte Queue Limits (BQL) mechanism is commonly used to address a similar issue with wired network interfaces. However, this method cannot be applied directly to the wireless network interface. "Bytes" is not a suitable measure of queue depth in the wireless network, as the data rate can vary dramatically from station to station in the same network, from a few Mbps to over Gbps. This patch implements an Airtime-based Queue Limit (AQL) to make CoDel work effectively with wireless drivers that utilize firmware/hardware offloading.
AQL allows each txq to release just enough packets to the lower layer to form 1-2 large aggregations to keep hardware fully utilized and retains the rest of the frames in mac80211 layer to be controlled by the CoDel algorithm. Signed-off-by: Kan Yan [ Toke: Keep API to set pending airtime internal, fix nits in commit msg ] Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20191119060610.76681-4-kyan@google.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 +++++++ include/net/mac80211.h | 12 ++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5ded77fad7fb..059524b87c4c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2606,6 +2606,13 @@ enum wiphy_params_flags { #define IEEE80211_DEFAULT_AIRTIME_WEIGHT 256 +/* The per TXQ device queue limit in airtime */ +#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L 5000 +#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H 12000 + +/* The per interface airtime threshold to switch to lower queue limit */ +#define IEEE80211_AQL_THRESHOLD 24000 + /** * struct cfg80211_pmksa - PMK Security Association * diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 6fc26a051ba0..ba3f33cc41ea 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5565,6 +5565,18 @@ void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid); void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, u32 tx_airtime, u32 rx_airtime); +/** + * ieee80211_txq_airtime_check - check if a txq can send frame to device + * + * @hw: pointer obtained from ieee80211_alloc_hw() + * @txq: pointer obtained from station or virtual interface + * + * Return true if the AQL's airtime limit has not been reached and the txq can + * continue to send more packets to the device. Otherwise return false. 
+ */ +bool +ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); + /** * ieee80211_iter_keys - iterate keys programmed into the device * @hw: pointer obtained from ieee80211_alloc_hw() -- cgit v1.2.3 From 7a89233ac50468a3a9636803a85d06c8f907f8ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 18 Nov 2019 22:06:10 -0800 Subject: mac80211: Use Airtime-based Queue Limits (AQL) on packet dequeue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit added the ability to throttle stations when they queue too much airtime in the hardware. This commit enables the functionality by calculating the expected airtime usage of each packet that is dequeued from the TXQs in mac80211, and accounting that as pending airtime. The estimated airtime for each skb is stored in the tx_info, so we can subtract the same amount from the running total when the skb is freed or recycled. The throttling mechanism relies on this accounting to be accurate (i.e., that we are not freeing skbs without subtracting any airtime they were accounted for), so we put the subtraction into ieee80211_report_used_skb(). As an optimisation, we also subtract the airtime on regular TX completion, zeroing out the value stored in the packet afterwards, to avoid having to do an expensive lookup of the station from the packet data on every packet. This patch does *not* include any mechanism to wake a throttled TXQ again, on the assumption that this will happen anyway as a side effect of whatever freed the skb (most commonly a TX completion). 
Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20191119060610.76681-5-kyan@google.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ba3f33cc41ea..aa145808e57a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1060,6 +1060,22 @@ struct ieee80211_tx_info { }; }; +static inline u16 +ieee80211_info_set_tx_time_est(struct ieee80211_tx_info *info, u16 tx_time_est) +{ + /* We only have 10 bits in tx_time_est, so store airtime + * in increments of 4us and clamp the maximum to 2**12-1 + */ + info->tx_time_est = min_t(u16, tx_time_est, 4095) >> 2; + return info->tx_time_est << 2; +} + +static inline u16 +ieee80211_info_get_tx_time_est(struct ieee80211_tx_info *info) +{ + return info->tx_time_est << 2; +} + /** * struct ieee80211_tx_status - extended tx status info for rate control * -- cgit v1.2.3 From d1746d1e80a86ca86b0c2680510898d411d2ef47 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 22 Nov 2019 15:47:21 +0000 Subject: net: flow_dissector: Wrap unionized VLAN fields in a struct In commit a82055af5959 ("netfilter: nft_payload: add VLAN offload support"), VLAN fields in struct flow_dissector_key_vlan were unionized with the intention of introducing another field that covered the whole TCI header. However without a wrapping struct the subfields end up sharing the same bits. As a result, "tc filter add ... flower vlan_id 14" specifies not only vlan_id, but also vlan_priority. Fix by wrapping the individual VLAN fields in a struct. Fixes: a82055af5959 ("netfilter: nft_payload: add VLAN offload support") Signed-off-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index f06b0239c32b..b8c20e9f343e 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -49,9 +49,11 @@ struct flow_dissector_key_tags { struct flow_dissector_key_vlan { union { - u16 vlan_id:12, - vlan_dei:1, - vlan_priority:3; + struct { + u16 vlan_id:12, + vlan_dei:1, + vlan_priority:3; + }; __be16 vlan_tci; }; __be16 vlan_tpid; -- cgit v1.2.3 From a18fab48dbacbb7ff104a13e987778b7995bec07 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 28 Oct 2019 16:58:53 +0200 Subject: net/mlx5: DR, Add HW bits and definitions for Geneve flex parser Add definition for flex parser tunneling header for Geneve. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4f912d4e67bc..5d54fccf87fc 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1110,6 +1110,7 @@ enum { }; enum { + MLX5_FLEX_PARSER_GENEVE_ENABLED = 1 << 3, MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED = 1 << 7, MLX5_FLEX_PARSER_ICMP_V4_ENABLED = 1 << 8, MLX5_FLEX_PARSER_ICMP_V6_ENABLED = 1 << 9, -- cgit v1.2.3 From 30429fba99b51836ea8a11174be95ddaa8c47703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Fri, 22 Nov 2019 13:50:52 -0800 Subject: net: inet_is_local_reserved_port() should return bool not int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Eric Dumazet Signed-off-by: Maciej Żenczykowski Signed-off-by: Jakub Kicinski --- include/net/ip.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 
a2c61c36dc4a..cebf3e10def1 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -339,10 +339,10 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o void inet_get_local_port_range(struct net *net, int *low, int *high); #ifdef CONFIG_SYSCTL -static inline int inet_is_local_reserved_port(struct net *net, int port) +static inline bool inet_is_local_reserved_port(struct net *net, int port) { if (!net->ipv4.sysctl_local_reserved_ports) - return 0; + return false; return test_bit(port, net->ipv4.sysctl_local_reserved_ports); } @@ -357,9 +357,9 @@ static inline int inet_prot_sock(struct net *net) } #else -static inline int inet_is_local_reserved_port(struct net *net, int port) +static inline bool inet_is_local_reserved_port(struct net *net, int port) { - return 0; + return false; } static inline int inet_prot_sock(struct net *net) -- cgit v1.2.3 From 84bb46cd62283cc371769ec1f77ff7924099f584 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 23 Nov 2019 09:54:58 -0800 Subject: Revert "bpf: Emit audit messages upon successful prog load and unload" This commit reverts commit 91e6015b082b ("bpf: Emit audit messages upon successful prog load and unload") and its follow up commit 7599a896f2e4 ("audit: Move audit_log_task declaration under CONFIG_AUDITSYSCALL") as requested by Paul Moore. The change needs close review on linux-audit, tests etc. 
Signed-off-by: Jakub Kicinski --- include/linux/audit.h | 5 ----- include/uapi/linux/audit.h | 1 - 2 files changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index 18925d924c73..aee3dc9eb378 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -358,8 +358,6 @@ static inline void audit_ptrace(struct task_struct *t) __audit_ptrace(t); } -extern void audit_log_task(struct audit_buffer *ab); - /* Private API (for audit.c only) */ extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); @@ -647,9 +645,6 @@ static inline void audit_ntp_log(const struct audit_ntp_data *ad) static inline void audit_ptrace(struct task_struct *t) { } - -static inline void audit_log_task(struct audit_buffer *ab) -{ } #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 32a5db900f47..c89c6495983d 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -116,7 +116,6 @@ #define AUDIT_FANOTIFY 1331 /* Fanotify access decision */ #define AUDIT_TIME_INJOFFSET 1332 /* Timekeeping offset injected */ #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */ -#define AUDIT_BPF 1334 /* BPF subsystem */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ -- cgit v1.2.3 From e3cf8b3668a808c1d252269ffc34a5723cfb9a7b Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 22 Nov 2019 12:37:08 +0000 Subject: net: phy: remove phy_ethtool_sset() There are no users of phy_ethtool_sset() in the kernel anymore, and as of commit 3c1bcc8614db ("net: ethernet: Convert phydev advertize and supported from u32 to link mode"), the implementation is slightly buggy - it doesn't correctly check the masked advertising mask as it used to. 
Remove it, and update the phy documentation to refer to its replacement function. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 124516fe2763..f5cdfb206097 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1160,7 +1160,6 @@ void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies); void phy_mac_interrupt(struct phy_device *phydev); void phy_start_machine(struct phy_device *phydev); void phy_stop_machine(struct phy_device *phydev); -int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd); void phy_ethtool_ksettings_get(struct phy_device *phydev, struct ethtool_link_ksettings *cmd); int phy_ethtool_ksettings_set(struct phy_device *phydev, -- cgit v1.2.3 From d46b7e4fb06037a61415f5b6964fcf632ee1dc34 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 21 Nov 2019 00:36:22 +0000 Subject: net: phylink: rename mac_link_state() op to mac_pcs_get_state() Rename the mac_link_state() method to mac_pcs_get_state() to make it clear that it should be returning the MACs PCS current state, which is used for inband negotiation rather than just reading back what the MAC has been configured for. Update the documentation to explicitly mention that this is for inband. We drop the return value as well; most of phylink doesn't check the return value and it is not clear what it should do on error - instead arrange for state->link to be false. 
Signed-off-by: Russell King Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 300ecdb6790a..fed5488e3c75 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -72,7 +72,7 @@ struct phylink_config { /** * struct phylink_mac_ops - MAC operations structure. * @validate: Validate and update the link configuration. - * @mac_link_state: Read the current link state from the hardware. + * @mac_pcs_get_state: Read the current link state from the hardware. * @mac_config: configure the MAC for the selected mode and state. * @mac_an_restart: restart 802.3z BaseX autonegotiation. * @mac_link_down: take the link down. @@ -84,8 +84,8 @@ struct phylink_mac_ops { void (*validate)(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); - int (*mac_link_state)(struct phylink_config *config, - struct phylink_link_state *state); + void (*mac_pcs_get_state)(struct phylink_config *config, + struct phylink_link_state *state); void (*mac_config)(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state); void (*mac_an_restart)(struct phylink_config *config); @@ -127,18 +127,19 @@ void validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); /** - * mac_link_state() - Read the current link state from the hardware + * mac_pcs_get_state() - Read the current inband link state from the hardware * @config: a pointer to a &struct phylink_config. * @state: a pointer to a &struct phylink_link_state. 
* - * Read the current link state from the MAC, reporting the current - * speed in @state->speed, duplex mode in @state->duplex, pause mode - * in @state->pause using the %MLO_PAUSE_RX and %MLO_PAUSE_TX bits, - * negotiation completion state in @state->an_complete, and link - * up state in @state->link. + * Read the current inband link state from the MAC PCS, reporting the + * current speed in @state->speed, duplex mode in @state->duplex, pause + * mode in @state->pause using the %MLO_PAUSE_RX and %MLO_PAUSE_TX bits, + * negotiation completion state in @state->an_complete, and link up state + * in @state->link. If possible, @state->lp_advertising should also be + * populated. */ -int mac_link_state(struct phylink_config *config, - struct phylink_link_state *state); +void mac_pcs_get_state(struct phylink_config *config, + struct phylink_link_state *state); /** * mac_config() - configure the MAC for the selected mode and state @@ -166,7 +167,7 @@ int mac_link_state(struct phylink_config *config, * 1000base-X or Cisco SGMII mode depending on the @state->interface * mode). In both cases, link state management (whether the link * is up or not) is performed by the MAC, and reported via the - * mac_link_state() callback. Changes in link state must be made + * mac_pcs_get_state() callback. Changes in link state must be made * by calling phylink_mac_change(). 
* * If in 802.3z mode, the link speed is fixed, dependent on the -- cgit v1.2.3 From 312434617cb16be5166316cf9d08ba760b1042a1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 23 Nov 2019 11:56:49 +0800 Subject: sctp: cache netns in sctp_ep_common This patch is to fix a data-race reported by syzbot: BUG: KCSAN: data-race in sctp_assoc_migrate / sctp_hash_obj write to 0xffff8880b67c0020 of 8 bytes by task 18908 on cpu 1: sctp_assoc_migrate+0x1a6/0x290 net/sctp/associola.c:1091 sctp_sock_migrate+0x8aa/0x9b0 net/sctp/socket.c:9465 sctp_accept+0x3c8/0x470 net/sctp/socket.c:4916 inet_accept+0x7f/0x360 net/ipv4/af_inet.c:734 __sys_accept4+0x224/0x430 net/socket.c:1754 __do_sys_accept net/socket.c:1795 [inline] __se_sys_accept net/socket.c:1792 [inline] __x64_sys_accept+0x4e/0x60 net/socket.c:1792 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffff8880b67c0020 of 8 bytes by task 12003 on cpu 0: sctp_hash_obj+0x4f/0x2d0 net/sctp/input.c:894 rht_key_get_hash include/linux/rhashtable.h:133 [inline] rht_key_hashfn include/linux/rhashtable.h:159 [inline] rht_head_hashfn include/linux/rhashtable.h:174 [inline] head_hashfn lib/rhashtable.c:41 [inline] rhashtable_rehash_one lib/rhashtable.c:245 [inline] rhashtable_rehash_chain lib/rhashtable.c:276 [inline] rhashtable_rehash_table lib/rhashtable.c:316 [inline] rht_deferred_worker+0x468/0xab0 lib/rhashtable.c:420 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 It was caused by rhashtable accessing asoc->base.sk while sctp_assoc_migrate is changing its value. However, what rhashtable wants is the netns from asoc->base.sk, and for an asoc, its netns won't change once set. So we can simply fix it by caching the netns at creation time.
Fixes: d6c0256a60e6 ("sctp: add the rhashtable apis for sctp global transport hashtable") Reported-by: syzbot+e3b35fe7918ff0ee474e@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski --- include/net/sctp/structs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 503fbc3cd819..2b6f3f13d5bc 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1239,6 +1239,9 @@ struct sctp_ep_common { /* What socket does this endpoint belong to? */ struct sock *sk; + /* Cache netns and it won't change once set */ + struct net *net; + /* This is where we receive inbound chunks. */ struct sctp_inq inqueue; -- cgit v1.2.3 From 4b3da77b72ad6b3c48c6fe4a395ace7db39a12c5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:54 +0100 Subject: bpf, x86: Generalize and extend bpf_arch_text_poke for direct jumps Add BPF_MOD_{NOP_TO_JUMP,JUMP_TO_JUMP,JUMP_TO_NOP} patching for x86 JIT in order to be able to patch direct jumps or nop them out. We need this facility in order to patch tail call jumps and in later work also BPF static keys. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/aa4784196a8e5e985af4b30a4fe5336bce6e9643.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e89e86122233..7978b617caa8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1284,10 +1284,16 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, #endif /* CONFIG_INET */ enum bpf_text_poke_type { + /* All call-related pokes. */ BPF_MOD_NOP_TO_CALL, BPF_MOD_CALL_TO_CALL, BPF_MOD_CALL_TO_NOP, + /* All jump-related pokes. 
*/ + BPF_MOD_NOP_TO_JUMP, + BPF_MOD_JUMP_TO_JUMP, + BPF_MOD_JUMP_TO_NOP, }; + int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); -- cgit v1.2.3 From 6332be04c039a72fca32ed0a4265bac58d606bb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:55 +0100 Subject: bpf: Move bpf_free_used_maps into sleepable section We later on are going to need a sleepable context as opposed to plain RCU callback in order to untrack programs we need to poke at runtime and tracking as well as image update is performed under mutex. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/09823b1d5262876e9b83a8e75df04cf0467357a4.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7978b617caa8..561b920f0bf7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1031,6 +1031,10 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, { return -ENOTSUPP; } + +static inline void bpf_map_put(struct bpf_map *map) +{ +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, -- cgit v1.2.3 From 2beee5f57441413b64a9c2bd657e17beabb98d1c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:56 +0100 Subject: bpf: Move owner type, jited info into array auxiliary data We're going to extend this with further information which is only relevant for prog array at this point. Given this info is not used in critical path, move it into its own structure such that the main array map structure can be kept on diet. 
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b9ddccdb0f6f7026489ee955f16c96381e1e7238.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 561b920f0bf7..c3b29061284e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -560,17 +560,21 @@ struct bpf_prog_aux { }; }; +struct bpf_array_aux { + /* 'Ownership' of prog array is claimed by the first program that + * is going to use this map or by the first program which FD is + * stored in the map to make sure that all callers and callees have + * the same prog type and JITed flag. + */ + enum bpf_prog_type type; + bool jited; +}; + struct bpf_array { struct bpf_map map; u32 elem_size; u32 index_mask; - /* 'ownership' of prog_array is claimed by the first program that - * is going to use this map or by the first program which FD is stored - * in the map to make sure that all callers and callees have the same - * prog_type and JITed flag - */ - enum bpf_prog_type owner_prog_type; - bool owner_jited; + struct bpf_array_aux *aux; union { char value[0] __aligned(8); void *ptrs[0] __aligned(8); -- cgit v1.2.3 From a66886fe6c24ebeeb6dc10fbd9b75158029eacf7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:57 +0100 Subject: bpf: Add initial poke descriptor table for jit images Add initial poke table data structures and management to the BPF prog that can later be used by JITs. Also add an instance of poke specific data for tail call maps; plan for later work is to extend this also for BPF static keys. 
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1db285ec2ea4207ee0455b3f8e191a4fc58b9ade.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 20 ++++++++++++++++++++ include/linux/filter.h | 10 ++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c3b29061284e..312983bf7faa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,6 +488,24 @@ struct bpf_func_info_aux { bool unreliable; }; +enum bpf_jit_poke_reason { + BPF_POKE_REASON_TAIL_CALL, +}; + +/* Descriptor of pokes pointing /into/ the JITed image. */ +struct bpf_jit_poke_descriptor { + void *ip; + union { + struct { + struct bpf_map *map; + u32 key; + } tail_call; + }; + bool ip_stable; + u8 adj_off; + u16 reason; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -513,6 +531,8 @@ struct bpf_prog_aux { const char *attach_func_name; struct bpf_prog **func; void *jit_data; /* JIT specific data. 
arch dependent */ + struct bpf_jit_poke_descriptor *poke_tab; + u32 size_poke_tab; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; diff --git a/include/linux/filter.h b/include/linux/filter.h index ad80e9c6111c..796b60d8cc6c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -952,6 +952,9 @@ void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke); + int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); @@ -1055,6 +1058,13 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) return false; } +static inline int +bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke) +{ + return -ENOTSUPP; +} + static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); -- cgit v1.2.3 From da765a2f599304a81a25e77908d1790414ecdbb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:58 +0100 Subject: bpf: Add poke dependency tracking for prog array maps This work adds program tracking to prog array maps. This is needed such that upon prog array updates/deletions we can fix up all programs which make use of this tail call map. We add ops->map_poke_{un,}track() helpers to maps to maintain the list of programs and ops->map_poke_run() for triggering the actual update. bpf_array_aux is extended to contain the list head and poke_mutex in order to serialize program patching during updates/deletions. bpf_free_used_maps() will untrack the program shortly before dropping the reference to the map. For clearing out the prog array once all urefs are dropped we need to use schedule_work() to have a sleepable context. 
The prog_array_map_poke_run() is triggered during updates/deletions and walks the maintained prog list. It checks in their poke_tabs whether the map and key is matching and runs the actual bpf_arch_text_poke() for patching in the nop or new jmp location. Depending on the type of update, we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1fb364bb3c565b3e415d5ea348f036ff379e779d.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 312983bf7faa..c2f07fd410c1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,6 +22,7 @@ struct bpf_verifier_env; struct bpf_verifier_log; struct perf_event; struct bpf_prog; +struct bpf_prog_aux; struct bpf_map; struct sock; struct seq_file; @@ -64,6 +65,12 @@ struct bpf_map_ops { const struct btf_type *key_type, const struct btf_type *value_type); + /* Prog poke tracking helpers. */ + int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux); + void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux); + void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old, + struct bpf_prog *new); + /* Direct value access helpers. */ int (*map_direct_value_addr)(const struct bpf_map *map, u64 *imm, u32 off); @@ -588,6 +595,11 @@ struct bpf_array_aux { */ enum bpf_prog_type type; bool jited; + /* Programs with direct jumps into programs part of this array. 
*/ + struct list_head poke_progs; + struct bpf_map *map; + struct mutex poke_mutex; + struct work_struct work; }; struct bpf_array { -- cgit v1.2.3 From d2e4c1e6c2947269346054ac8937ccfe9e0bcc6b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:59 +0100 Subject: bpf: Constant map key tracking for prog array pokes Add tracking of constant keys into tail call maps. The signature of bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3 is an index key. The direct call approach for tail calls can be enabled if the verifier asserted that for all branches leading to the tail call helper invocation, the map pointer and index key were both constant and the same. Tracking of map pointers we already do from prior work via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/ delete calls on maps"). Given the tail call map index key is not on stack but directly in the register, we can add similar tracking approach and later in fixup_bpf_calls() add a poke descriptor to the prog's poke_tab with the relevant information for the JITing phase. We internally reuse insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order to point into the prog's poke_tab, and keep insn->imm as 0 as indicator that current indirect tail call emission must be used. Note that publishing to the tracker must happen at the end of fixup_bpf_calls() since adding elements to the poke_tab reallocates its memory, so we need to wait until it is in its final state. Future work can generalize and add similar approach to optimize plain array map lookups. Difference there is that we need to look into the key value that sits on stack. For clarity in bpf_insn_aux_data, map_state has been renamed into map_ptr_state, so we get map_{ptr,key}_state as trackers.
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/e8db37f6b2ae60402fa40216c96738ee9b316c32.1574452833.git.daniel@iogearbox.net --- include/linux/bpf_verifier.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cdd08bf0ec06..26e40de9ef55 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -293,7 +293,7 @@ struct bpf_verifier_state_list { struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ - unsigned long map_state; /* pointer/poison value for maps */ + unsigned long map_ptr_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ struct { @@ -301,6 +301,7 @@ struct bpf_insn_aux_data { u32 map_off; /* offset from value base address */ }; }; + u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ -- cgit v1.2.3 From b8cd76ca4ae34731d47cd6a876d912a08efcc240 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 23 Nov 2019 21:37:31 +0100 Subject: bpf: Add bpf_jit_blinding_enabled for !CONFIG_BPF_JIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a definition of bpf_jit_blinding_enabled() when CONFIG_BPF_JIT is not set in order to fix a recent build regression: [...] CC kernel/bpf/verifier.o CC kernel/bpf/inode.o kernel/bpf/verifier.c: In function ‘fixup_bpf_calls’: kernel/bpf/verifier.c:9132:25: error: implicit declaration of function ‘bpf_jit_blinding_enabled’; did you mean ‘bpf_jit_kallsyms_enabled’? 
[-Werror=implicit-function-declaration] 9132 | bool expect_blinding = bpf_jit_blinding_enabled(prog); | ^~~~~~~~~~~~~~~~~~~~~~~~ | bpf_jit_kallsyms_enabled CC kernel/bpf/helpers.o CC kernel/bpf/hashtab.o [...] Fixes: d2e4c1e6c294 ("bpf: Constant map key tracking for prog array pokes") Reported-by: Jakub Sitnicki Reported-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/40baf8f3507cac4851a310578edfb98ce73b5605.1574541375.git.daniel@iogearbox.net --- include/linux/filter.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 796b60d8cc6c..1b1e8b8f88da 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1053,6 +1053,11 @@ static inline bool ebpf_jit_enabled(void) return false; } +static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) +{ + return false; +} + static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return false; -- cgit v1.2.3 From b553a6ec570044fc1ae300c6fb24f9ce204c5894 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 24 Nov 2019 01:39:42 +0100 Subject: bpf: Simplify __bpf_arch_text_poke poke type handling Given that we have BPF_MOD_NOP_TO_{CALL,JUMP}, BPF_MOD_{CALL,JUMP}_TO_NOP and BPF_MOD_{CALL,JUMP}_TO_{CALL,JUMP} poke types and that we also pass in old_addr as well as new_addr, it's a bit redundant and unnecessarily complicates __bpf_arch_text_poke() itself since we can derive the same from the *_addr that were passed in. Hence simplify and use BPF_MOD_{CALL,JUMP} as types which also allows to clean up call-sites. In addition to that, __bpf_arch_text_poke() currently verifies that text matches expected old_insn before we invoke text_poke_bp(). Also add a check on new_insn and skip rewrite if it already matches. 
Reason why this is rather useful is that it avoids making any special casing in prog_array_map_poke_run() when old and new prog were NULL and has the benefit that also for this case we perform a check on text whether it really matches our expectations. Suggested-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/fcb00a2b0b288d6c73de4ef58116a821c8fe8f2f.1574555798.git.daniel@iogearbox.net --- include/linux/bpf.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c2f07fd410c1..35903f148be5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1324,14 +1324,8 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, #endif /* CONFIG_INET */ enum bpf_text_poke_type { - /* All call-related pokes. */ - BPF_MOD_NOP_TO_CALL, - BPF_MOD_CALL_TO_CALL, - BPF_MOD_CALL_TO_NOP, - /* All jump-related pokes. */ - BPF_MOD_NOP_TO_JUMP, - BPF_MOD_JUMP_TO_JUMP, - BPF_MOD_JUMP_TO_NOP, + BPF_MOD_CALL, + BPF_MOD_JUMP, }; int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, -- cgit v1.2.3 From bec170e55982c2d3b8e1beccadf16e288fe6fb5a Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 23 Nov 2019 17:28:37 +0100 Subject: net: phy: add helpers phy_(un)lock_mdio_bus Add helpers to make locking/unlocking the MDIO bus easier. Signed-off-by: Heiner Kallweit Signed-off-by: David S. 
Miller --- include/linux/phy.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index f5cdfb206097..5032d453ac66 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1076,6 +1076,16 @@ static inline const char *phydev_name(const struct phy_device *phydev) return dev_name(&phydev->mdio.dev); } +static inline void phy_lock_mdio_bus(struct phy_device *phydev) +{ + mutex_lock(&phydev->mdio.bus->mdio_lock); +} + +static inline void phy_unlock_mdio_bus(struct phy_device *phydev) +{ + mutex_unlock(&phydev->mdio.bus->mdio_lock); +} + void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) __printf(2, 3); void phy_attached_info(struct phy_device *phydev); -- cgit v1.2.3